ahmeterdempmk committed on
Commit
03848cc
·
verified ·
1 Parent(s): 2ef2219

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +116 -3
README.md CHANGED
@@ -11,12 +11,125 @@ language:
11
  - en
12
  ---
13
 
14
- # Uploaded model
15
 
16
  - **Developed by:** ahmeterdempmk
17
  - **License:** apache-2.0
18
  - **Finetuned from model :** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
19
 
20
- This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
21
 
22
- [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  - en
12
  ---
13
 
14
+ # Uploaded Model
15
 
16
  - **Developed by:** ahmeterdempmk
17
  - **License:** apache-2.0
18
  - **Finetuned from model :** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
19
 
20
+ This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
21
 
22
+ # Pipeline
23
+ ```py
24
+ import torch
25
+ from unsloth import FastLanguageModel
26
+ from snac import SNAC
27
+ from IPython.display import Audio, display
28
+ import numpy as np
29
+
30
# Token ids used to frame the prompt and to interpret the generated stream.
# Ids above the text range are written as offsets from TOKENISER_LENGTH so
# that related values cannot drift apart.
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6

# Generation-time names. Several share a value with a constant above; they
# are defined by reference instead of repeating the literal.
GEN_START_TOKEN = START_OF_HUMAN  # 128259, prepended to the prompt
GEN_EOS_TOKEN = TOKENISER_LENGTH + 2  # 128258, stops generation
GEN_END_EXTRA_TOKEN = END_OF_HUMAN  # 128260, appended after END_OF_TEXT
GEN_REMOVE_TOKEN = GEN_EOS_TOKEN  # 128258, stripped from the output
CODE_OFFSET = TOKENISER_LENGTH + 10  # 128266, subtracted to get audio codes
43
+
44
def load_models(HF_TOKEN):
    """Load the fine-tuned language model, its tokenizer and the SNAC codec.

    Args:
        HF_TOKEN: Hugging Face access token, forwarded to both
            ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer, snac_model, device)`` tuple. ``device`` is
        ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise. Only the
        SNAC model is explicitly moved to it here.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="ahmeterdempmk/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN,
    )
    # Put the language model into Unsloth's inference mode before generating.
    FastLanguageModel.for_inference(model)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = SNAC.from_pretrained(
        "hubertsiuzdak/snac_24khz", token=HF_TOKEN
    ).to(device)

    return model, tokenizer, snac_model, device
57
+
58
def redistribute_codes(code_list, snac_model, device):
    """Regroup a flat stream of 7-code frames into three layers and decode it.

    The stream is read in consecutive groups of seven codes. Within a group,
    position ``k`` has ``k * 4096`` subtracted from it, and the positions are
    routed as follows:

    * position 0 -> layer 1 (one code per group)
    * positions 1, 4 -> layer 2 (two codes per group)
    * positions 2, 3, 5, 6 -> layer 3 (four codes per group)

    Trailing codes that do not fill a whole group are ignored.

    Args:
        code_list: Flat sequence of integer codes.
        snac_model: Object exposing ``decode(codes)``; it receives a list of
            three int64 tensors of shape ``(1, n)``, ``(1, 2n)``, ``(1, 4n)``.
        device: Device the three code tensors are moved to.

    Returns:
        Whatever ``snac_model.decode`` returns for the regrouped codes.

    Raises:
        ValueError: If ``code_list`` holds fewer than seven codes, so there is
            no complete group to decode.
    """
    codes_per_frame = 7
    codebook_size = 4096

    num_frames = len(code_list) // codes_per_frame
    if num_frames == 0:
        # torch.tensor([]) would silently become an empty float tensor and
        # fail much later inside the decoder; fail early and clearly instead.
        raise ValueError(
            "code_list must contain at least 7 codes, got " + str(len(code_list))
        )

    frames = torch.tensor(
        code_list[: num_frames * codes_per_frame], dtype=torch.long
    ).view(num_frames, codes_per_frame)
    # Remove the per-position offset (0, 4096, 2*4096, ..., 6*4096).
    frames = frames - codebook_size * torch.arange(codes_per_frame)

    # Row-major flattening keeps the codes of each frame adjacent and in
    # position order, matching a frame-by-frame append.
    codes = [
        frames[:, [0]].reshape(1, -1).to(device),
        frames[:, [1, 4]].reshape(1, -1).to(device),
        frames[:, [2, 3, 5, 6]].reshape(1, -1).to(device),
    ]

    return snac_model.decode(codes)
78
+
79
def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    """Generate an audio waveform for a text prompt.

    The prompt is tokenized and wrapped as
    ``[GEN_START_TOKEN] + prompt ids + [END_OF_TEXT, GEN_END_EXTRA_TOKEN]``,
    the model samples up to 1200 new tokens, and the generated audio tokens
    are converted to codes and decoded by ``redistribute_codes``.

    Args:
        prompt: Text to synthesize.
        model: Language model exposing ``generate``.
        tokenizer: Tokenizer callable returning an object with ``input_ids``.
        snac_model: Audio codec passed through to ``redistribute_codes``.
        device: Device on which the input tensors are created.

    Returns:
        The waveform returned by ``redistribute_codes``.

    Raises:
        ValueError: If the generation contains no complete 7-token audio frame.
    """
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor(
        [[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device
    )
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
    attention_mask = torch.ones_like(modified_input_ids, device=device)

    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True,
    )

    # Everything after the last occurrence of this id is treated as audio.
    # NOTE(review): presumably the model's "audio starts here" marker - confirm.
    marker_token = 128257
    marker_positions = (generated_ids == marker_token).nonzero(as_tuple=True)[1]
    if marker_positions.numel() > 0:
        audio_start = marker_positions[-1].item() + 1
    else:
        # No marker was produced. The returned sequence still begins with the
        # prompt, so skip it rather than decoding prompt ids as audio codes.
        audio_start = modified_input_ids.shape[1]
    cropped_tensor = generated_ids[:, audio_start:]

    # Drop the end-of-generation id and any other id below CODE_OFFSET; such
    # ids would turn into negative codes after the offset is subtracted.
    keep = (cropped_tensor != GEN_REMOVE_TOKEN) & (cropped_tensor >= CODE_OFFSET)
    processed_tokens = cropped_tensor[keep]

    # Only whole 7-token frames can be decoded; discard a trailing partial one.
    new_length = (processed_tokens.size(0) // 7) * 7
    if new_length == 0:
        raise ValueError(
            "Generation produced no complete 7-token audio frame to decode."
        )
    trimmed_tokens = processed_tokens[:new_length]

    code_list = (trimmed_tokens - CODE_OFFSET).tolist()

    return redistribute_codes(code_list, snac_model, device)
119
+
120
if __name__ == "__main__":
    # Replace the placeholder with a real Hugging Face access token.
    HF_TOKEN = "YOUR_TOKEN"
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)

    prompt = "In the image, there is 2 man riding bike."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)

    # Move the waveform to host memory and drop singleton dimensions so it
    # can be handed to the notebook audio widget.
    audio_array = np.squeeze(audio_output.detach().cpu().numpy())

    if audio_array.ndim != 1 and audio_array.ndim != 2:
        raise ValueError("Array audio input must be a 1D or 2D array, but got shape: " + str(audio_array.shape))

    # Playback rate matches the 24 kHz SNAC checkpoint loaded in load_models.
    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
135
+ ```