---
license: mit
language:
- en
- de
- fr
- it
- pt
- hi
- es
- th
base_model:
- Qwen/Qwen2.5-Omni-7B
pipeline_tag: any-to-any
tags:
- gptqmodel
- FunAGI
- Qwen
- int4
---

This is a 4-bit quantized version of the Qwen2.5-Omni-7B model, produced with [GPTQModel](https://github.com/ModelCloud/GPTQModel).

- **bits**: 4
- **dynamic**: null
- **group_size**: 128
- **desc_act**: true
- **static_groups**: false
- **sym**: false
- **lm_head**: false
- **true_sequential**: true
- **quant_method**: "gptq"
- **checkpoint_format**: "gptq"
- **meta**:
  - **quantizer**: gptqmodel:1.1.0
  - **uri**: https://github.com/modelcloud/gptqmodel
  - **damp_percent**: 0.1
  - **damp_auto_increment**: 0.0015

# Model Size

| Model           | FP       | 4-bit    |
|-----------------|----------|----------|
| Qwen2.5-Omni-7B | 22.39 GB | 12.71 GB |

# Qwen 2.5 Omni Model Loading Guide

## Installation

Following the official Qwen [documentation](https://github.com/QwenLM/Qwen2.5-Omni/tree/main), install the dependencies:

```bash
pip uninstall transformers
pip install git+https://github.com/huggingface/transformers@3a1ead0aabed473eafe527915eea8c197d424356
pip install accelerate
pip install qwen-omni-utils[decord]
```

Then install GPTQModel from its [GitHub repository](https://github.com/ModelCloud/GPTQModel).
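Before moving on, it is worth verifying that the pinned `transformers` commit and GPTQModel are actually the ones being imported. A minimal sanity-check sketch (the `Qwen2_5Omni*` class names assume the pinned commit above; later `transformers` releases renamed these classes):

```python
# Environment sanity check: the pinned transformers commit must expose the
# Qwen2.5-Omni classes, and GPTQModel and CUDA should be available.
import importlib.metadata

import torch
import transformers
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor  # noqa: F401

print("transformers:", transformers.__version__)
print("gptqmodel:", importlib.metadata.version("gptqmodel"))
print("CUDA available:", torch.cuda.is_available())
```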
## Loading Model Code

```python
import os
import json
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image
from typing import Any, Dict, List, Optional, Tuple, Union
from transformers import (
    Qwen2_5OmniModel,
    Qwen2_5OmniProcessor,
    AutoModelForVision2Seq,
    AutoProcessor,
    AutoTokenizer
)
from transformers.utils.hub import cached_file
from transformers.generation.utils import GenerateOutput
from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
from gptqmodel.models.base import BaseGPTQModel
from gptqmodel.models.auto import MODEL_MAP, SUPPORTED_MODELS
from gptqmodel.models._const import CPU
# move_to is GPTQModel's helper for shifting modules between devices;
# its import path may differ across GPTQModel versions.
from gptqmodel.utils.model import move_to
from datasets import load_dataset
from qwen_omni_utils import process_mm_info


class Qwen25OmniThinkerGPTQ(BaseGPTQModel):
    loader = Qwen2_5OmniModel
    # Modules that stay unquantized; quantization targets the thinker's decoder layers below.
    base_modules = [
        "thinker.model.embed_tokens",
        "thinker.model.norm",
        "token2wav",
        "thinker.audio_tower",
        "thinker.model.rotary_emb",
        "thinker.visual",
        "talker"
    ]
    pre_lm_head_norm_module = "thinker.model.norm"
    require_monkeypatch = False
    layers_node = "thinker.model.layers"
    layer_type = "Qwen2_5OmniDecoderLayer"
    layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]

    def pre_quantize_generate_hook_start(self):
        self.thinker.visual = move_to(self.thinker.visual, device=self.quantize_config.device)
        self.thinker.audio_tower = move_to(self.thinker.audio_tower, device=self.quantize_config.device)

    def pre_quantize_generate_hook_end(self):
        self.thinker.visual = move_to(self.thinker.visual, device=CPU)
        self.thinker.audio_tower = move_to(self.thinker.audio_tower, device=CPU)

    def preprocess_dataset(self, sample: Dict) -> Dict:
        return sample


# Register the custom model definition so GPTQModel recognizes qwen2_5_omni checkpoints.
MODEL_MAP["qwen2_5_omni"] = Qwen25OmniThinkerGPTQ
SUPPORTED_MODELS.append("qwen2_5_omni")

model_path = "/home/chentianqi/model/Qwen/Qwen2.5-Omni-7B-GPTQ-4bit"

from types import MethodType

# Patch from_config so the speaker dictionary (spk_dict.pt) is loaded from model_path.
@classmethod
def patched_from_config(cls, config, *args, **kwargs):
    kwargs.pop("trust_remote_code", None)

    model = cls._from_config(config, **kwargs)
    spk_path = cached_file(
        model_path,
        "spk_dict.pt",
        subfolder=kwargs.pop("subfolder", None),
        cache_dir=kwargs.pop("cache_dir", None),
        force_download=kwargs.pop("force_download", False),
        proxies=kwargs.pop("proxies", None),
        resume_download=kwargs.pop("resume_download", None),
        local_files_only=kwargs.pop("local_files_only", False),
        token=kwargs.pop("use_auth_token", None),
        revision=kwargs.pop("revision", None),
    )
    if spk_path is None:
        raise ValueError(f"Speaker dictionary not found at {spk_path}")

    model.load_speakers(spk_path)
    return model

Qwen2_5OmniModel.from_config = patched_from_config

# FP model
# model = Qwen2_5OmniModel.from_pretrained(
#     model_path,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     attn_implementation="flash_attention_2",
# )

# GPTQ model
model = GPTQModel.load(
    model_path,
    device_map="cuda",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2"
)
```

## Testing the Model

```python
from qwen_omni_utils import process_mm_info

processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

# Inference helper: builds a chat prompt around a video and returns the decoded text.
def inference(video_path, prompt, sys_prompt):
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "video", "video": video_path},
        ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # image_inputs, video_inputs = process_vision_info([messages])
    audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
    inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
    inputs = inputs.to(model.device).to(model.dtype)
    output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)

    text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return text


video_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4"
prompt = "Please translate the abstract of the paper into Chinese."

# display(Video(video_path, width=640, height=360))

# Run inference with the locally loaded model.
response = inference(video_path, prompt=prompt, sys_prompt="You are a helpful assistant.")
print(response[0])
```

## Notes

- The loading code includes both the commented-out FP (BF16) loading path and the GPTQ 4-bit loading path; switch between them by uncommenting the FP block.
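The sizes in the Model Size table above are checkpoint sizes on disk. For a rough on-GPU comparison between the two loading paths, a small sketch like the following can be run right after loading; it assumes a single CUDA device and only reads PyTorch's allocator statistics, so the numbers are approximate.

```python
import torch

def report_gpu_memory(tag: str) -> None:
    # Allocator statistics for the current CUDA device, in GiB.
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"[{tag}] allocated: {allocated:.2f} GiB | reserved: {reserved:.2f} GiB")

# Call after either loading path (FP from_pretrained or GPTQModel.load)
# to compare footprints on the same hardware.
report_gpu_memory("after model load")
```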