danielhanchen committed · verified
Commit 6b5acae · 1 Parent(s): 5cfe0cb

Add files using upload-large-folder tool

README.md CHANGED
@@ -1,31 +1,21 @@
  ---
- base_model: Qwen/Qwen2.5-VL-32B-Instruct
+ base_model:
+ - Qwen/Qwen2.5-VL-32B-Instruct
  license: apache-2.0
  language:
  - en
  pipeline_tag: image-text-to-text
  tags:
  - multimodal
+ - unsloth
  library_name: transformers
  ---
- <div>
- <p style="margin-bottom: 0; margin-top: 0;">
-   <strong>See <a href="https://huggingface.co/collections/unsloth/qwen25-vl-all-versions-679ca6c784fad5bd976a05a1">our collection</a> for versions of Qwen2.5-VL including 4-bit & dynamic formats.</strong>
- </p>
- <div style="display: flex; gap: 5px; align-items: center; ">
- <a href="https://github.com/unslothai/unsloth/">
-   <img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="133">
- </a>
- <a href="https://discord.gg/unsloth">
-   <img src="https://github.com/unslothai/unsloth/raw/main/images/Discord%20button.png" width="173">
- </a>
- <a href="https://docs.unsloth.ai/">
-   <img src="https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation%20green%20button.png" width="143">
- </a>
- </div>
- </div>

  # Qwen2.5-VL-32B-Instruct
+ <a href="https://chat.qwenlm.ai/" target="_blank" style="margin: 2px;">
+   <img alt="Chat" src="https://img.shields.io/badge/%F0%9F%92%9C%EF%B8%8F%20Qwen%20Chat%20-536af5" style="display: inline-block; vertical-align: middle;"/>
+ </a>
+

  ## Latest Updates:
  In addition to the original formula, we have further enhanced Qwen2.5-VL-32B's mathematical and problem-solving abilities through reinforcement learning. This has also significantly improved the model's subjective user experience, with response styles adjusted to better align with human preferences. Particularly for objective queries such as mathematics, logical reasoning, and knowledge-based Q&A, the level of detail in responses and the clarity of formatting have been noticeably enhanced.

@@ -62,7 +52,7 @@ We extend dynamic resolution to the temporal dimension by adopting dynamic FPS s
  We enhance both training and inference speeds by strategically implementing window attention into the ViT. The ViT architecture is further optimized with SwiGLU and RMSNorm, aligning it with the structure of the Qwen2.5 LLM.


- We have three models with 3, 7 and 72 billion parameters. This repo contains the instruction-tuned 32B Qwen2.5-VL model. For more information, visit our [Blog](https://qwenlm.github.io/blog/qwen2.5-vl/) and [GitHub](https://github.com/QwenLM/Qwen2.5-VL).
+ We have four models with 3, 7, 32 and 72 billion parameters. This repo contains the instruction-tuned 32B Qwen2.5-VL model. For more information, visit our [Blog](https://qwenlm.github.io/blog/qwen2.5-vl/) and [GitHub](https://github.com/QwenLM/Qwen2.5-VL).
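The updated model card keeps the `image-text-to-text` pipeline tag with `transformers` as the library, so the checkpoint is meant to be loaded through the standard Qwen2.5-VL classes. A minimal loading sketch follows; the Hub id, dtype, and device placement are assumptions, and a recent transformers release with Qwen2.5-VL support (the config was saved with 4.52.0.dev0) is required.

```python
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

repo_id = "unsloth/Qwen2.5-VL-32B-Instruct"  # assumed Hub id of this repository

# bfloat16 matches the checkpoint's "torch_dtype"; device_map="auto" is an assumption.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(repo_id)
```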
 
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+ You are a helpful assistant.<|im_end|>
+ {% endif %}<|im_start|>{{ message['role'] }}
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+ {% endif %}
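The added chat_template.jinja counts images and videos with Jinja namespaces, injects <|vision_start|><|image_pad|><|vision_end|> / <|vision_start|><|video_pad|><|vision_end|> placeholders for each visual input, and prefixes "Picture N:" / "Video N:" labels only when the template variable `add_vision_id` is set. A short rendering sketch, assuming the Hub id and image path are placeholders:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("unsloth/Qwen2.5-VL-32B-Instruct")  # assumed Hub id

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/example.jpg"},  # illustrative path
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Extra kwargs such as add_vision_id are forwarded to the template as variables.
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, add_vision_id=True
)
print(prompt)  # system block, "Picture 1: <|vision_start|><|image_pad|><|vision_end|>", then the question
```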
config.json CHANGED
@@ -9,7 +9,7 @@
  "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 27648,
- "max_position_embeddings": 32768,
+ "max_position_embeddings": 128000,
  "max_window_layers": 64,
  "model_type": "qwen2_5_vl",
  "num_attention_heads": 40,
@@ -48,9 +48,48 @@
  },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
+ "text_config": {
+   "architectures": [
+     "Qwen2_5_VLForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 5120,
+   "image_token_id": null,
+   "initializer_range": 0.02,
+   "intermediate_size": 27648,
+   "max_position_embeddings": 128000,
+   "max_window_layers": 64,
+   "model_type": "qwen2_5_vl_text",
+   "num_attention_heads": 40,
+   "num_hidden_layers": 64,
+   "num_key_value_heads": 8,
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": {
+     "mrope_section": [
+       16,
+       24,
+       24
+     ],
+     "rope_type": "default",
+     "type": "default"
+   },
+   "rope_theta": 1000000.0,
+   "sliding_window": 32768,
+   "torch_dtype": "bfloat16",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "video_token_id": null,
+   "vision_end_token_id": 151653,
+   "vision_start_token_id": 151652,
+   "vision_token_id": 151654,
+   "vocab_size": 152064
+ },
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "transformers_version": "4.50.3",
+ "transformers_version": "4.52.0.dev0",
  "unsloth_fixed": true,
  "use_cache": true,
  "use_sliding_window": false,
@@ -67,6 +106,7 @@
  "hidden_size": 1280,
  "in_channels": 3,
  "in_chans": 3,
+ "initializer_range": 0.02,
  "intermediate_size": 3456,
  "model_type": "qwen2_5_vl",
  "num_heads": 16,
generation_config.json CHANGED
@@ -5,9 +5,9 @@
    151645,
    151643
  ],
- "max_length": 32768,
+ "max_length": 128000,
  "pad_token_id": 151654,
  "repetition_penalty": 1.05,
  "temperature": 1e-06,
- "transformers_version": "4.50.3"
+ "transformers_version": "4.52.0.dev0"
  }
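The generation defaults keep near-greedy decoding (temperature 1e-06, repetition_penalty 1.05) and only raise max_length to 128000 so generation is not cut off before the extended context is used. A sketch for reading the shipped defaults back, Hub id assumed:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("unsloth/Qwen2.5-VL-32B-Instruct")  # assumed Hub id

print(gen_cfg.max_length)          # 128000 (was 32768)
print(gen_cfg.temperature)         # 1e-06, effectively greedy
print(gen_cfg.repetition_penalty)  # 1.05
print(gen_cfg.eos_token_id)        # [151645, 151643]
```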
tokenizer_config.json CHANGED
@@ -195,16 +195,16 @@
    "<|video_pad|>"
  ],
  "bos_token": null,
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
- "model_max_length": 32768,
+ "model_max_length": 128000,
  "pad_token": "<|vision_pad|>",
  "padding_side": "left",
  "processor_class": "Qwen2_5_VLProcessor",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
- }
+ "unk_token": null,
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ }
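On the tokenizer side, model_max_length is raised to 128000 and the chat template stored in tokenizer_config.json is swapped from the text-only tool-calling template to the multimodal template that matches chat_template.jinja. A verification sketch, Hub id assumed:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("unsloth/Qwen2.5-VL-32B-Instruct")  # assumed Hub id

print(tok.model_max_length)          # 128000 after this change
print(tok.pad_token, tok.eos_token)  # <|vision_pad|> <|im_end|>
print(tok.chat_template[:40])        # multimodal template with namespace counters
```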