prince-canuma committed
Commit f165a42 · verified · 1 Parent(s): dc81193

Upload folder using huggingface_hub
README.md CHANGED
@@ -1,36 +1,32 @@
  ---
- base_model: moonshotai/Kimi-VL-A3B-Thinking
- license: mit
- pipeline_tag: text-generation
- library_name: mlx
  tags:
  - mlx
  ---

  # mlx-community/Kimi-VL-A3B-Thinking-4bit
-
- This model [mlx-community/Kimi-VL-A3B-Thinking-4bit](https://huggingface.co/mlx-community/Kimi-VL-A3B-Thinking-4bit) was
- converted to MLX format from [moonshotai/Kimi-VL-A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking)
- using mlx-lm version **0.22.4**.
-
  ## Use with mlx

  ```bash
- pip install mlx-lm
  ```

- ```python
- from mlx_lm import load, generate
-
- model, tokenizer = load("mlx-community/Kimi-VL-A3B-Thinking-4bit")
-
- prompt = "hello"
-
- if tokenizer.chat_template is not None:
-     messages = [{"role": "user", "content": prompt}]
-     prompt = tokenizer.apply_chat_template(
-         messages, add_generation_prompt=True
-     )
-
- response = generate(model, tokenizer, prompt=prompt, verbose=True)
  ```

  ---
+ license: other
+ license_name: qwen
+ license_link: https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/LICENSE
+ pipeline_tag: image-text-to-text
+ library_name: transformers
+ base_model:
+ - OpenGVLab/InternViT-300M-448px-V2_5
+ - Qwen/Qwen2.5-0.5B
+ base_model_relation: merge
+ datasets:
+ - OpenGVLab/MMPR-v1.2
+ language:
+ - multilingual
  tags:
+ - internvl
+ - custom_code
  - mlx
  ---

  # mlx-community/Kimi-VL-A3B-Thinking-4bit
+ This model was converted to MLX format from [`moonshotai/Kimi-VL-A3B-Thinking`]() using mlx-vlm version **0.1.23**.
+ Refer to the [original model card](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking) for more details on the model.

  ## Use with mlx

  ```bash
+ pip install -U mlx-vlm
  ```

+ ```bash
+ python -m mlx_vlm.generate --model mlx-community/Kimi-VL-A3B-Thinking-4bit --max-tokens 100 --temperature 0.0 --prompt "Describe this image." --image <path_to_image>
  ```
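For reference, the same generation can also be driven from Python. This is a minimal sketch assuming the mlx-vlm 0.1.x Python API (`load`, `generate`, `prompt_utils.apply_chat_template`, `utils.load_config`); argument names and order have shifted between mlx-vlm releases, so treat it as illustrative and check the package README for exact signatures. The image path is a placeholder.

```python
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

model_id = "mlx-community/Kimi-VL-A3B-Thinking-4bit"

# Load the 4-bit MLX weights together with the paired processor, plus the config.
model, processor = load(model_id)
config = load_config(model_id)

# Format the request with the model's chat template (one image placeholder).
images = ["path/to/image.png"]  # placeholder path
prompt = apply_chat_template(processor, config, "Describe this image.", num_images=len(images))

# Generate, mirroring the CLI call above.
output = generate(model, processor, prompt, images, max_tokens=100, verbose=True)
print(output)
```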
chat_template.jinja ADDED
@@ -0,0 +1,31 @@
+ {%- for message in messages -%}
+ {%- if loop.first and messages[0]['role'] != 'system' -%}
+ {{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'system' -%}
+ {{'<|im_system|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'user' -%}
+ {{'<|im_user|>'}}
+ {%- endif -%}
+ {%- if message['role'] == 'assistant' -%}
+ {{'<|im_assistant|>'}}
+ {%- endif -%}
+ {{- message['role'] -}}
+ {{'<|im_middle|>'}}
+ {%- if message['content'] is string -%}
+ {{- message['content'] + '<|im_end|>' -}}
+ {%- else -%}
+ {%- for content in message['content'] -%}
+ {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+ {{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}
+ {%- else -%}
+ {{content['text']}}
+ {%- endif -%}
+ {%- endfor -%}
+ {{'<|im_end|>'}}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ {{'<|im_assistant|>assistant<|im_middle|>'}}
+ {%- endif -%}
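To see the string this template produces, it can be rendered directly with Jinja2. A small sketch using a hypothetical single user turn with one image and one text segment:

```python
from jinja2 import Template

chat_template = open("chat_template.jinja").read()

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},                                 # image placeholder
            {"type": "text", "text": "Describe this image."},  # text segment
        ],
    }
]

rendered = Template(chat_template).render(messages=messages, add_generation_prompt=True)
print(rendered)
# The result is one continuous string (the '-' markers strip all whitespace), roughly:
# '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'
# '<|im_user|>user<|im_middle|><|media_start|>image<|media_content|><|media_pad|><|media_end|>Describe this image.<|im_end|>'
# '<|im_assistant|>assistant<|im_middle|>'
```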
config.json CHANGED
@@ -1,4 +1,6 @@
  {
  "architectures": [
  "KimiVLForConditionalGeneration"
  ],
@@ -7,17 +9,60 @@
  "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
  "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
  },
  "ignore_index": -100,
  "media_placeholder_token_id": 163605,
  "model_type": "kimi_vl",
  "quantization": {
  "group_size": 64,
  "bits": 4
  },
- "quantization_config": {
- "group_size": 64,
- "bits": 4
- },
  "text_config": {
  "vocab_size": 163840,
  "max_position_embeddings": 131072,
@@ -25,6 +70,7 @@
  "intermediate_size": 11264,
  "moe_intermediate_size": 1408,
  "num_hidden_layers": 27,
  "num_attention_heads": 16,
  "n_shared_experts": 2,
  "n_routed_experts": 64,
@@ -55,14 +101,154 @@
  "rope_scaling": null,
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 163584,
  "pad_token_id": 163839,
  "eos_token_id": 163585,
- "torch_dtype": "bfloat16",
- "tie_word_embeddings": false
  },
  "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
- "transformers_version": "4.50.3",
  "vocab_size": 163840
  }

  {
+ "_attn_implementation_autoset": false,
+ "add_cross_attention": false,
  "architectures": [
  "KimiVLForConditionalGeneration"
  ],
  "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
  "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
  },
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
  "ignore_index": -100,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
  "media_placeholder_token_id": 163605,
+ "min_length": 0,
  "model_type": "kimi_vl",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": 0,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
  "quantization": {
  "group_size": 64,
  "bits": 4
  },
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
  "text_config": {
  "vocab_size": 163840,
  "max_position_embeddings": 131072,
  "intermediate_size": 11264,
  "moe_intermediate_size": 1408,
  "num_hidden_layers": 27,
+ "num_nextn_predict_layers": 1,
  "num_attention_heads": 16,
  "n_shared_experts": 2,
  "n_routed_experts": 64,
  "rope_scaling": null,
  "attention_bias": false,
  "attention_dropout": 0.0,
+ "return_dict": true,
+ "output_hidden_states": false,
+ "output_attentions": false,
+ "torchscript": false,
+ "torch_dtype": "bfloat16",
+ "use_bfloat16": false,
+ "tf_legacy_loss": false,
+ "pruned_heads": {},
+ "tie_word_embeddings": false,
+ "chunk_size_feed_forward": 0,
+ "is_encoder_decoder": false,
+ "is_decoder": false,
+ "cross_attention_hidden_size": null,
+ "add_cross_attention": false,
+ "tie_encoder_decoder": false,
+ "max_length": 20,
+ "min_length": 0,
+ "do_sample": false,
+ "early_stopping": false,
+ "num_beams": 1,
+ "num_beam_groups": 1,
+ "diversity_penalty": 0.0,
+ "temperature": 1.0,
+ "top_k": 50,
+ "top_p": 1.0,
+ "typical_p": 1.0,
+ "repetition_penalty": 1.0,
+ "length_penalty": 1.0,
+ "no_repeat_ngram_size": 0,
+ "encoder_no_repeat_ngram_size": 0,
+ "bad_words_ids": null,
+ "num_return_sequences": 1,
+ "output_scores": false,
+ "return_dict_in_generate": false,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "remove_invalid_values": false,
+ "exponential_decay_length_penalty": null,
+ "suppress_tokens": null,
+ "begin_suppress_tokens": null,
+ "architectures": null,
+ "finetuning_task": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "tokenizer_class": null,
+ "prefix": null,
  "bos_token_id": 163584,
  "pad_token_id": 163839,
  "eos_token_id": 163585,
+ "sep_token_id": null,
+ "decoder_start_token_id": null,
+ "task_specific_params": null,
+ "problem_type": null,
+ "_name_or_path": "",
+ "_attn_implementation_autoset": false,
+ "model_type": "deepseek_v3"
  },
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
  "tie_word_embeddings": false,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torchscript": false,
+ "transformers_version": "4.52.0.dev0",
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "vision_config": {
+ "return_dict": true,
+ "output_hidden_states": false,
+ "output_attentions": false,
+ "torchscript": false,
+ "torch_dtype": "bfloat16",
+ "use_bfloat16": false,
+ "tf_legacy_loss": false,
+ "pruned_heads": {},
+ "tie_word_embeddings": true,
+ "chunk_size_feed_forward": 0,
+ "is_encoder_decoder": false,
+ "is_decoder": false,
+ "cross_attention_hidden_size": null,
+ "add_cross_attention": false,
+ "tie_encoder_decoder": false,
+ "max_length": 20,
+ "min_length": 0,
+ "do_sample": false,
+ "early_stopping": false,
+ "num_beams": 1,
+ "num_beam_groups": 1,
+ "diversity_penalty": 0.0,
+ "temperature": 1.0,
+ "top_k": 50,
+ "top_p": 1.0,
+ "typical_p": 1.0,
+ "repetition_penalty": 1.0,
+ "length_penalty": 1.0,
+ "no_repeat_ngram_size": 0,
+ "encoder_no_repeat_ngram_size": 0,
+ "bad_words_ids": null,
+ "num_return_sequences": 1,
+ "output_scores": false,
+ "return_dict_in_generate": false,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "remove_invalid_values": false,
+ "exponential_decay_length_penalty": null,
+ "suppress_tokens": null,
+ "begin_suppress_tokens": null,
+ "architectures": null,
+ "finetuning_task": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "tokenizer_class": null,
+ "prefix": null,
+ "bos_token_id": null,
+ "pad_token_id": null,
+ "eos_token_id": null,
+ "sep_token_id": null,
+ "decoder_start_token_id": null,
+ "task_specific_params": null,
+ "problem_type": null,
+ "_name_or_path": "",
+ "_attn_implementation_autoset": false,
+ "model_type": "moonvit",
+ "patch_size": 14,
+ "init_pos_emb_height": 64,
+ "init_pos_emb_width": 64,
+ "num_hidden_layers": 27,
+ "num_attention_heads": 16,
+ "hidden_size": 1152,
+ "intermediate_size": 4304,
+ "merge_kernel_size": [
+ 2,
+ 2
+ ],
+ "skip_vision": true
+ },
  "vocab_size": 163840
  }
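For anyone consuming the converted repo, the fields that matter most are the top-level MLX quantization block and the nested model types; they can be checked with nothing but the standard library:

```python
import json

# Peek at the converted config: MLX-style quantization lives at the top level,
# and the text/vision sub-configs carry their own model_type fields.
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                   # kimi_vl
print(cfg["quantization"])                 # {'group_size': 64, 'bits': 4}
print(cfg["text_config"]["model_type"])    # deepseek_v3
print(cfg["vision_config"]["model_type"])  # moonvit
```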
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dc1b4b9e7cdf18df281b554b721e8b3da7916fd36a7673b1396b83df64de08a7
- size 5284632734
  version https://git-lfs.github.com/spec/v1
+ oid sha256:9fa24384962d98e76557bbb00193e2ac40aa6456f617cbd79730eb63787c40a4
+ size 5356237611
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:861f9444d7ff364daa0ba01ea1d00f07c54fffc9ff780265caaeb7836b29522c
- size 3698140150
  version https://git-lfs.github.com/spec/v1
+ oid sha256:63ce34dfdd6f4ca593e196cfffe6336c304984d7c870112c9df86b6f3b719433
+ size 4477574765
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
modeling_kimi_vl.py CHANGED
@@ -55,10 +55,8 @@ import torch.distributed as dist
  from torch.nn import CrossEntropyLoss
  from transformers.activations import GELUActivation, ACT2FN, PytorchGELUTanh
  from transformers.cache_utils import Cache, DynamicCache
- from transformers.modeling_utils import (
- PreTrainedModel,
- GenerationMixin,
- )
  from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
  from transformers.modeling_outputs import (
@@ -906,6 +904,7 @@ class MoEGate(nn.Module):
  self.n_routed_experts = config.n_routed_experts
  self.routed_scaling_factor = config.routed_scaling_factor
  self.scoring_func = config.scoring_func
  self.seq_aux = config.seq_aux
  self.topk_method = config.topk_method
  self.n_group = config.n_group
@@ -972,6 +971,10 @@ class MoEGate(nn.Module):
  ) # [n, e]
  _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
  topk_weight = scores.gather(1, topk_idx)
  else:
  raise NotImplementedError(
  f"insupportable TopK function for MoE gating: {self.topk_method}"
@@ -985,7 +988,57 @@ class MoEGate(nn.Module):
  topk_weight * self.routed_scaling_factor
  ) # must multiply the scaling factor

- return topk_idx, topk_weight


  class DeepseekV3MoE(nn.Module):
@@ -1038,9 +1091,20 @@ class DeepseekV3MoE(nn.Module):
  def forward(self, hidden_states):
  identity = hidden_states
  orig_shape = hidden_states.shape
- topk_idx, topk_weight = self.gate(hidden_states)
  hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
- if not self.training:
  y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
  if self.config.n_shared_experts is not None:
  y = y + self.shared_experts(identity)

  from torch.nn import CrossEntropyLoss
  from transformers.activations import GELUActivation, ACT2FN, PytorchGELUTanh
  from transformers.cache_utils import Cache, DynamicCache
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.generation.utils import GenerationMixin
  from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
  from transformers.modeling_outputs import (

  self.n_routed_experts = config.n_routed_experts
  self.routed_scaling_factor = config.routed_scaling_factor
  self.scoring_func = config.scoring_func
+ self.alpha = config.aux_loss_alpha
  self.seq_aux = config.seq_aux
  self.topk_method = config.topk_method
  self.n_group = config.n_group

  ) # [n, e]
  _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
  topk_weight = scores.gather(1, topk_idx)
+ elif self.topk_method == "greedy":
+ topk_weight, topk_idx = torch.topk(
+ scores, k=self.top_k, dim=-1, sorted=False
+ )
  else:
  raise NotImplementedError(
  f"insupportable TopK function for MoE gating: {self.topk_method}"

  topk_weight * self.routed_scaling_factor
  ) # must multiply the scaling factor

+ if self.training and self.alpha > 0.0:
+ scores_for_aux = scores
+ aux_topk = self.top_k
+ # always compute aux loss based on the naive greedy topk method
+ topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
+ if self.seq_aux:
+ scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
+ ce = torch.zeros(
+ bsz, self.n_routed_experts, device=hidden_states.device
+ )
+ ce.scatter_add_(
+ 1,
+ topk_idx_for_aux_loss,
+ torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device),
+ ).div_(seq_len * aux_topk / self.n_routed_experts)
+ aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(
+ dim=1
+ ).mean() * self.alpha
+ else:
+ mask_ce = F.one_hot(
+ topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts
+ )
+ ce = mask_ce.float().mean(0)
+ Pi = scores_for_aux.mean(0)
+ fi = ce * self.n_routed_experts
+ aux_loss = (Pi * fi).sum() * self.alpha
+ else:
+ aux_loss = None
+
+ return topk_idx, topk_weight, aux_loss
+
+
+ class AddAuxiliaryLoss(torch.autograd.Function):
+ """
+ The trick function of adding auxiliary (aux) loss,
+ which includes the gradient of the aux loss during backpropagation.
+ """
+
+ @staticmethod
+ def forward(ctx, x, loss):
+ assert loss.numel() == 1
+ ctx.dtype = loss.dtype
+ ctx.required_aux_loss = loss.requires_grad
+ return x
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ grad_loss = None
+ if ctx.required_aux_loss:
+ grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
+ return grad_output, grad_loss


  class DeepseekV3MoE(nn.Module):

  def forward(self, hidden_states):
  identity = hidden_states
  orig_shape = hidden_states.shape
+ topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
  hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+ if self.training:
+ flat_topk_idx = topk_idx.view(-1)
+ hidden_states = hidden_states.repeat_interleave(
+ self.num_experts_per_tok, dim=0
+ )
+ y = torch.empty_like(hidden_states)
+ for i, expert in enumerate(self.experts):
+ y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
+ y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+ y = y.to(hidden_states.dtype).view(*orig_shape)
+ y = AddAuxiliaryLoss.apply(y, aux_loss)
+ else:
  y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
  if self.config.n_shared_experts is not None:
  y = y + self.shared_experts(identity)
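These changes restore the DeepSeek-style training path of the gate: a "greedy" top-k branch and an optional auxiliary load-balancing loss attached through the `AddAuxiliaryLoss` straight-through trick. A toy, self-contained sketch of just the greedy gating arithmetic (hypothetical sizes, not the model's actual forward pass):

```python
import torch

torch.manual_seed(0)
n_tokens, n_experts, top_k, routed_scaling_factor = 4, 8, 2, 1.0

# Router scores per token over the routed experts (softmax scoring, as in MoEGate).
scores = torch.randn(n_tokens, n_experts).softmax(dim=-1)

# "greedy" top-k: keep the k highest-scoring experts for each token.
topk_weight, topk_idx = torch.topk(scores, k=top_k, dim=-1, sorted=False)

# Renormalize the kept weights, then apply the routed scaling factor
# (the real gate applies its own normalization rules before this step).
topk_weight = topk_weight / topk_weight.sum(dim=-1, keepdim=True)
topk_weight = topk_weight * routed_scaling_factor

print(topk_idx.tolist())     # expert indices per token, shape [4, 2]
print(topk_weight.tolist())  # mixing weights per token, shape [4, 2]
```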
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "auto_map": {
+ "AutoImageProcessor": "image_processing_kimi_vl.KimiVLImageProcessor",
+ "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+ },
+ "image_mean": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "image_processor_type": "KimiVLImageProcessor",
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "in_token_limit": 4096,
+ "merge_kernel_size": [
+ 2,
+ 2
+ ],
+ "num_pooled_tokens": 1024,
+ "pad_input": true,
+ "patch_size": 14,
+ "processor_class": "KimiVLProcessor"
+ }
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "auto_map": {
+ "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
+ },
+ "processor_class": "KimiVLProcessor"
+ }
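With these files in place, the custom processor should be resolvable through the transformers auto classes. A sketch, assuming the repo also ships the referenced `processing_kimi_vl.py` / `image_processing_kimi_vl.py` modules and that you opt in to remote code:

```python
from transformers import AutoProcessor

# auto_map points AutoProcessor at the repo's processing_kimi_vl.KimiVLProcessor,
# so loading it requires trust_remote_code=True.
processor = AutoProcessor.from_pretrained(
    "mlx-community/Kimi-VL-A3B-Thinking-4bit",
    trust_remote_code=True,
)
print(type(processor).__name__)  # expected: KimiVLProcessor
```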
tokenization_moonshot.py CHANGED
@@ -16,6 +16,7 @@ from shutil import copyfile
  from tiktoken.load import load_tiktoken_bpe
  from tokenizers import AddedToken
  from transformers.tokenization_utils import PreTrainedTokenizer
  from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

@@ -229,6 +230,8 @@ class TikTokenTokenizer(PreTrainedTokenizer):
  if len(kwargs) > 0:
  return super().decode(token_ids, **kwargs)

  if type(token_ids) is int:
  token_ids = [token_ids]

  from tiktoken.load import load_tiktoken_bpe
  from tokenizers import AddedToken
  from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import to_py_obj
  from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

  if len(kwargs) > 0:
  return super().decode(token_ids, **kwargs)

+ token_ids = to_py_obj(token_ids)
+
  if type(token_ids) is int:
  token_ids = [token_ids]
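The added `to_py_obj` call lets `decode` accept framework tensors as well as plain Python lists. A small illustrative check (the prompt here is arbitrary):

```python
import torch
from transformers import AutoTokenizer

# With the fix, token ids held in a torch tensor decode cleanly; previously the
# tiktoken-backed decoder expected plain Python ints.
tok = AutoTokenizer.from_pretrained(
    "mlx-community/Kimi-VL-A3B-Thinking-4bit", trust_remote_code=True
)
ids = torch.tensor(tok.encode("hello"))
print(tok.decode(ids))  # "hello" (plus any special tokens the tokenizer adds)
```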
 
tokenizer_config.json CHANGED
@@ -117,18 +117,19 @@
  "<|media_pad|>"
  ],
  "auto_map": {
  "AutoTokenizer": [
  "tokenization_moonshot.TikTokenTokenizer",
  null
  ]
  },
  "bos_token": "[BOS]",
- "chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "[EOS]",
  "extra_special_tokens": {},
  "model_max_length": 1048576,
  "pad_token": "[PAD]",
  "tokenizer_class": "TikTokenTokenizer",
  "unk_token": "[UNK]"
  }

  "<|media_pad|>"
  ],
  "auto_map": {
+ "AutoProcessor": "processing_kimi_vl.KimiVLProcessor",
  "AutoTokenizer": [
  "tokenization_moonshot.TikTokenTokenizer",
  null
  ]
  },
  "bos_token": "[BOS]",
  "clean_up_tokenization_spaces": false,
  "eos_token": "[EOS]",
  "extra_special_tokens": {},
  "model_max_length": 1048576,
  "pad_token": "[PAD]",
+ "processor_class": "KimiVLProcessor",
  "tokenizer_class": "TikTokenTokenizer",
  "unk_token": "[UNK]"
  }