---
base_model: unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
license: apache-2.0
language:
- en
---

# Uploaded Model

- **Developed by:** AquaLabs
- **License:** apache-2.0
- **Finetuned from model:** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit

This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

# Installation

First, install the requirements with the following code:
```py
import os

# Outside Colab, a plain install resolves all dependencies automatically.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # On Colab, install pinned dependencies first, then Unsloth without deps.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# SNAC is the neural audio codec that turns generated tokens into a waveform.
!pip install snac
```
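
Optionally, as a quick sanity check before loading the model, you can confirm that the key packages import cleanly and that a GPU is visible (a minimal sketch; any CUDA-capable runtime works):

```py
import torch
from unsloth import FastLanguageModel  # confirms the Unsloth install
from snac import SNAC                   # confirms the SNAC codec install

print(f"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
```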

# Pipeline
```py
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display
import numpy as np

# Special-token IDs appended to the Llama-3 vocabulary by Orpheus.
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6

GEN_START_TOKEN = 128259      # prepended to the text prompt for generation
GEN_EOS_TOKEN = 128258        # end-of-speech token that stops generation
GEN_END_EXTRA_TOKEN = 128260  # closes the prompt after END_OF_TEXT
GEN_REMOVE_TOKEN = 128258     # stripped from the output before decoding
CODE_OFFSET = 128266          # audio-code tokens start at this ID

def load_models(HF_TOKEN):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(model)
    
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = snac_model.to(device)
    
    return model, tokenizer, snac_model, device

def redistribute_codes(code_list, snac_model, device):
    # Every 7 consecutive codes form one SNAC frame spread across the
    # codec's 3 codebook layers (1 + 2 + 4 codes per frame); each position
    # carries an offset of n * 4096 that is subtracted before decoding.
    layer_1, layer_2, layer_3 = [], [], []
    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        layer_1.append(group[0])
        layer_2.append(group[1] - 4096)
        layer_3.append(group[2] - (2 * 4096))
        layer_3.append(group[3] - (3 * 4096))
        layer_2.append(group[4] - (4 * 4096))
        layer_3.append(group[5] - (5 * 4096))
        layer_3.append(group[6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0).to(device),
        torch.tensor(layer_2).unsqueeze(0).to(device),
        torch.tensor(layer_3).unsqueeze(0).to(device)
    ]

    audio_waveform = snac_model.decode(codes)
    return audio_waveform

def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
    
    attention_mask = torch.ones_like(modified_input_ids, device=device)
    
    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )
    
    # 128257 marks the start of the audio-token stream; keep only the
    # tokens generated after its last occurrence.
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        cropped_tensor = generated_ids
    
    # Drop the end-of-speech tokens, trim to a whole number of 7-code
    # SNAC frames, and map token IDs back to raw codec codes.
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]

    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    trimmed_tokens = processed_tokens[:new_length]

    code_list = (trimmed_tokens - CODE_OFFSET).tolist()
    
    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform

if __name__ == "__main__":
    HF_TOKEN = "YOUR_TOKEN"  # replace with your Hugging Face access token
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)
    
    prompt = "In the image, there are two men riding bikes."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)

    audio_array = audio_output.detach().cpu().numpy()
    audio_array = np.squeeze(audio_array)

    if audio_array.ndim not in [1, 2]:
        raise ValueError(f"Audio array must be 1D or 2D, but got shape {audio_array.shape}")

    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
```
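
In a script rather than a notebook, you may prefer to write the waveform to disk instead of playing it inline. A minimal sketch, assuming the `soundfile` package is installed (it is not part of the requirements above); SNAC decodes at 24 kHz:

```py
import soundfile as sf

# audio_array is the NumPy array produced by the pipeline above.
sf.write("output.wav", audio_array, samplerate=24000)
```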

## Contributors

- Ahmet Erdem Pamuk - [GitHub](https://github.com/ahmeterdempmk) | [Hugging Face](https://huggingface.co/ahmeterdempmk)
- Emir Kaan Özdemir - [GitHub](https://github.com/emirkaanozdemr) | [Hugging Face](https://huggingface.co/emirkaanozdemr)
- Şuayp Talha Kocabay - [GitHub](https://github.com/suayptalha) | [Hugging Face](https://huggingface.co/suayptalha)

Details are provided in the [paper]().