test the inference script and quantized model, but have error as below
INFO ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
INFO ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.
from_quantized: adapter: None
INFO Estimated Quantization BPW (bits per weight): 4.2875 bpw, based on [bits: 4, group_size: 128]
Traceback (most recent call last):
File "D:\AITest\Qwen25Omni7B\web_demo_q4_me.py", line 494, in
model, processor = _load_model_processor(args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\AITest\Qwen25Omni7B\web_demo_q4_me.py", line 114, in _load_model_processor
model = GPTQModel.load(args.checkpoint_path, device_map="cuda", torch_dtype=torch.float16)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\AITest\Qwen25Omni7B\Python311\Lib\site-packages\gptqmodel\models\auto.py", line 243, in load
return cls.from_quantized(
^^^^^^^^^^^^^^^^^^^
File "D:\AITest\Qwen25Omni7B\Python311\Lib\site-packages\gptqmodel\models\auto.py", line 313, in from_quantized
return MODEL_MAP[model_type].from_quantized(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\AITest\Qwen25Omni7B\Python311\Lib\site-packages\gptqmodel\models\loader.py", line 431, in from_quantized
model = cls.loader.from_config(
^^^^^^^^^^^^^^^^^^^^^^^
TypeError: patched_from_config() missing 1 required positional argument: 'config'
Press any key to continue . . .
I try set the "model_path" and "checkpoint_path" same as ref path like "checkpoints/Qwen2.5-Omni-7B-GPTQ-4bit" or abs path like "D:\AITest\Qwen25Omni7B\checkpoints\Qwen2.5-Omni-7B-GPTQ-4bit", but the error message is the same as above.
I check the "D:\AITest\Qwen25Omni7B\Python311\Lib\site-packages\gptqmodel\models\loader.py", find:
......
model_local_path = get_model_local_path(pretrained_model_id_or_path, **model_init_kwargs)
......
config = AutoConfig.from_pretrained(model_local_path, **model_init_kwargs)
......
is this may be the script can't find the "config.json" file under the checkpoints directory?
Thank you for your attention. The issue arises because Qwen2.5-Omni does not implement the classmethod from_config, whereas GPTQModel relies on this function to load a quantized model (see this link[https://github.com/ModelCloud/GPTQModel/blob/c7bfc40a6d95212c5afd1509e504eaf4ef41fa0f/gptqmodel/models/loader.py#L431C1-L434C35]). To address this, we’ve added a patch that implements the function.
Could you please add a print statement for the config before model = cls.loader.from_config to help us check why the config is not being passed as a positional argument?
I encountered the same problem, and the error was the same, but the above method did not solve the problem:
INFO ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
INFO ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.
INFO:qwen_omni_utils.v2_5.vision_process:set VIDEO_TOTAL_PIXELS: 90316800
from_quantized: adapter: None
INFO Estimated Quantization BPW (bits per weight): 4.2875 bpw, based on [bits: 4, group_size: 128]
Traceback (most recent call last):
File "/mnt/e/ai/qwen/qwen2.5-omni/run_model.py", line 102, in <module>
model = GPTQModel.load(
^^^^^^^^^^^^^^^
File "/home/qqb/miniconda3/envs/qwen25o/lib/python3.12/site-packages/gptqmodel/models/auto.py", line 237, in load
return cls.from_quantized(
^^^^^^^^^^^^^^^^^^^
File "/home/qqb/miniconda3/envs/qwen25o/lib/python3.12/site-packages/gptqmodel/models/auto.py", line 307, in from_quantized
return MODEL_MAP[model_type].from_quantized(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/qqb/miniconda3/envs/qwen25o/lib/python3.12/site-packages/gptqmodel/models/loader.py", line 431, in from_quantized
model = cls.loader.from_config(
^^^^^^^^^^^^^^^^^^^^^^^
TypeError: patched_from_config() missing 1 required positional argument: 'config'
Here is the code:
import os
import json
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image
from typing import Any, Dict, List, Optional, Tuple, Union
from transformers import (
Qwen2_5OmniModel,
Qwen2_5OmniProcessor,
AutoModelForVision2Seq,
AutoProcessor,
AutoTokenizer
)
from transformers.utils.hub import cached_file
from transformers.generation.utils import GenerateOutput
from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
from gptqmodel.models.base import BaseGPTQModel
from gptqmodel.models.auto import MODEL_MAP, SUPPORTED_MODELS
from gptqmodel.models._const import CPU
from datasets import load_dataset
from qwen_omni_utils import process_mm_info
class Qwen25OmniThiknerGPTQ(BaseGPTQModel):
loader = Qwen2_5OmniModel
base_modules = [
"thinker.model.embed_tokens",
"thinker.model.norm",
"token2wav",
"thinker.audio_tower",
"thinker.model.rotary_emb",
"thinker.visual",
"talker"
]
pre_lm_head_norm_module = "thinker.model.norm"
require_monkeypatch = False
layers_node = "thinker.model.layers"
layer_type = "Qwen2_5OmniDecoderLayer"
layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
def pre_quantize_generate_hook_start(self):
self.thinker.visual = move_to(self.thinker.visual, device=self.quantize_config.device)
self.thinker.audio_tower = move_to(self.thinker.audio_tower, device=self.quantize_config.device)
def pre_quantize_generate_hook_end(self):
self.thinker.visual = move_to(self.thinker.visual, device=CPU)
self.thinker.audio_tower = move_to(self.thinker.audio_tower, device=CPU)
def preprocess_dataset(self, sample: Dict) -> Dict:
return sample
MODEL_MAP["qwen2_5_omni"] = Qwen25OmniThiknerGPTQ
SUPPORTED_MODELS.append("qwen2_5_omni")
model_path = "/mnt/e/ai/qwen/qwen2.5-omni/Qwen2.5-Omni-7B-GPTQ-4bit"
from types import MethodType
def patched_from_config(cls, config, *args, **kwargs):
kwargs.pop("trust_remote_code", None)
model_path = "/mnt/e/ai/qwen/qwen2.5-omni/Qwen2.5-Omni-7B-GPTQ-4bit"
model = cls._from_config(config, **kwargs)
spk_path = cached_file(
model_path,
"spk_dict.pt",
subfolder=kwargs.pop("subfolder", None),
cache_dir=kwargs.pop("cache_dir", None),
force_download=kwargs.pop("force_download", False),
proxies=kwargs.pop("proxies", None),
resume_download=kwargs.pop("resume_download", None),
local_files_only=kwargs.pop("local_files_only", False),
token=kwargs.pop("use_auth_token", None),
revision=kwargs.pop("revision", None),
)
if spk_path is None:
raise ValueError(f"Speaker dictionary not found at {spk_path}")
model.load_speakers(spk_path)
return model
Qwen2_5OmniModel.from_config = patched_from_config
# FP Model
# model = Qwen2_5OmniModel.from_pretrained(
# model_path,
# torch_dtype=torch.bfloat16,
# device_map="auto",
# attn_implementation="flash_attention_2",
# )
# GPTQ MODEL
model = GPTQModel.load(
model_path,
device_map="cuda",
torch_dtype=torch.float16,
attn_implementation="flash_attention_2"
)
from qwen_omni_utils import process_mm_info
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)
#
@title
inference function
def inference(video_path, prompt, sys_prompt):
messages = [
{"role": "system", "content": sys_prompt},
{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "video", "video": video_path},
]
},
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# image_inputs, video_inputs = process_vision_info([messages])
audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
inputs = inputs.to(model.device).to(model.dtype)
output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)
text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
return text
video_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4"
prompt = "Please trranslate the abstract of paper into Chinese."
# display(Video(video_path, width=640, height=360))
## Use a local HuggingFace model to inference.
response = inference(video_path, prompt=prompt, sys_prompt="You are a helpful assistant.")
print(response[0])
Sorry for the late reply. The patched_from_config method is missing the @classmethod decorator. @quqibing
Can this use all the capacities of the model on a 3090 with 24 gb vram?
Thank you for your work on this is am quite excited to try it out.