|
--- |
|
license: mit |
|
language: |
|
- en |
|
- de |
|
- fr |
|
- it |
|
- pt |
|
- hi |
|
- es |
|
- th |
|
base_model: |
|
- GSAI-ML/LLaDA-8B-Base |
|
pipeline_tag: text-generation |
|
tags: |
|
- gptqmodel |
|
- FunAGI |
|
- llada |
|
- int4 |
|
--- |
|
|
|
|
|
This is a 4-bit (INT4) quantization of [GSAI-ML/LLaDA-8B-Base](https://huggingface.co/GSAI-ML/LLaDA-8B-Base), produced with [GPTQModel](https://github.com/ModelCloud/GPTQModel). Quantization configuration:
|
|
|
- **bits**: 4 |
|
- **dynamic**: null |
|
- **group_size**: 128 |
|
- **desc_act**: true |
|
- **static_groups**: false |
|
- **sym**: false |
|
- **lm_head**: false |
|
- **true_sequential**: true |
|
- **quant_method**: "gptq" |
|
- **checkpoint_format**: "gptq" |
|
- **meta**:

  - **quantizer**: gptqmodel:1.1.0

  - **uri**: https://github.com/modelcloud/gptqmodel

  - **damp_percent**: 0.1

  - **damp_auto_increment**: 0.0015
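
For reference, a checkpoint with this configuration can be produced roughly as follows (a minimal sketch, assuming the custom `LladaGPTQ` registration and the wikitext calibration data from the full example below):

```python
from gptqmodel import GPTQModel, QuantizeConfig

# Mirrors the configuration listed above.
quantize_config = QuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=True,
    sym=False,
)

# `calibration_dataset` is a list of tokenized samples, e.g. from get_wikitext2() below.
model = GPTQModel.load("GSAI-ML/LLaDA-8B-Base", quantize_config, trust_remote_code=True)
model.quantize(calibration_dataset)
model.save("LLaDA-8B-Base-gptqmodel-4bit")
```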
|
|
|
## Benchmark |
|
### Performance of the Quantized Model
|
|
|
| Dataset | GPTQ-4bit | FP16 | |
|
|----------------|-------------|------| |
|
| mmlu | 65.20 | 65.9 | |
|
| cmmlu | 69.23 | 69.9 | |
|
| arc_challenge  | 45.48       | 47.90 |
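
To load the quantized checkpoint, a minimal sketch (the full runnable example, including LLaDA's diffusion-style sampler and the required `LladaGPTQ` registration, follows below):

```python
import torch
from gptqmodel import GPTQModel

# Note: GPTQModel needs the custom LLaDA model definition registered first;
# see MODEL_MAP["llada"] = LladaGPTQ in the example below.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = GPTQModel.load("FunAGI/LLaDA-8B-Base-gptqmodel-4bit", device=device, trust_remote_code=True)
```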
|
|
|
## Example
|
```python |
|
|
|
# Copyright 2024-2025 ModelCloud.ai |
|
# Copyright 2024-2025 [email protected] |
|
# Contact: [email protected], x.com/qubitium |
|
# |
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
# you may not use this file except in compliance with the License. |
|
# You may obtain a copy of the License at |
|
# |
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
# |
|
# Unless required by applicable law or agreed to in writing, software |
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
# See the License for the specific language governing permissions and |
|
# limitations under the License. |
|
|
|
import torch |
|
from datasets import load_dataset |
|
from gptqmodel import GPTQModel, QuantizeConfig
|
from gptqmodel.models.base import BaseGPTQModel |
|
from transformers import AutoTokenizer, AutoModel
|
from gptqmodel.models.auto import MODEL_MAP |
|
import torch.nn.functional as F |
|
import numpy as np
|
pretrained_model_id = "GSAI-ML/LLaDA-8B-Base"  # Hugging Face repo id, or a local path to the base model
|
quantized_model_id = "FunAGI/LLaDA-8B-Base-gptqmodel-4bit" |
|
|
|
|
|
|
|
class LladaGPTQ(BaseGPTQModel): |
|
# Non-repeating layers at the root level: same level as `layers_node` |
|
# Excluding `layers_node`. |
|
base_modules = ["model.transformer.wte", "model.transformer.ln_f"] |
|
pre_lm_head_norm_module = "model.transformer.ln_f" |
|
lm_head = "model.transformer.ff_out" |
|
    # Below describes all the repeating layers in this transformer model.

    # `model.transformer.blocks` is the module that holds all the repeating layers: the parent node for all n layers.

    layers_node = "model.transformer.blocks"

    # Each repeating layer in `model.transformer.blocks` is of type `LLaDALlamaBlock`.

    layer_type = "LLaDALlamaBlock"

    # Inside each `LLaDALlamaBlock` are many internal modules.

    # List them in the order they are executed in the model's forward() code:

    # attention (q/k/v) projections, attention output projection, then the mlp projections.
|
layer_modules = [ |
|
["attn_out", "k_proj", "v_proj", "q_proj"], |
|
["ff_proj", "up_proj"], |
|
["ff_out"], |
|
] |
|
MODEL_MAP["llada"] = LladaGPTQ
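
# Registering the custom definition in MODEL_MAP lets GPTQModel resolve the
# "llada" model_type to LladaGPTQ for both quantization and inference loading.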
|
|
|
|
def get_wikitext2(tokenizer, nsamples, seqlen): |
|
traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train").filter( |
|
lambda x: len(x["text"]) >= seqlen) |
|
|
|
return [tokenizer(example["text"]) for example in traindata.select(range(nsamples))] |
|
|
|
|
|
@torch.no_grad() |
|
def calculate_avg_ppl(model, tokenizer): |
|
from gptqmodel.utils import Perplexity |
|
|
|
ppl = Perplexity( |
|
model=model, |
|
tokenizer=tokenizer, |
|
dataset_path="wikitext", |
|
dataset_name="wikitext-2-raw-v1", |
|
split="train", |
|
text_column="text", |
|
) |
|
|
|
    ppl_values = ppl.calculate(n_ctx=512, n_batch=512)

    # average perplexity across all windows (avoid shadowing the builtin `all`)
    avg = sum(ppl_values) / len(ppl_values)
|
|
|
return avg |
|
|
|
dynamic = {}
|
|
|
def add_gumbel_noise(logits, temperature): |
|
''' |
|
The Gumbel max is a method for sampling categorical distributions. |
|
According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality. |
|
Thus, we use float64. |
|
''' |
|
logits = logits.to(torch.float64) |
|
noise = torch.rand_like(logits, dtype=torch.float64) |
|
gumbel_noise = (- torch.log(noise)) ** temperature |
|
return logits.exp() / gumbel_noise |
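
# Note: argmax(exp(logits) / (-log u) ** temperature) equals
# argmax(logits + temperature * gumbel), where gumbel = -log(-log u), i.e.
# Gumbel-max sampling from softmax(logits / temperature); temperature = 0
# reduces to greedy argmax decoding.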
|
|
|
|
|
def get_num_transfer_tokens(mask_index, steps): |
|
''' |
|
In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals. |
|
Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)), |
|
the expected number of tokens transitioned at each step should be consistent. |
|
|
|
This function is designed to precompute the number of tokens that need to be transitioned at each step. |
|
''' |
|
    mask_num = mask_index.sum(dim=1, keepdim=True)
|
|
|
base = mask_num // steps |
|
remainder = mask_num % steps |
|
|
|
num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base |
|
|
|
for i in range(mask_num.size(0)): |
|
num_transfer_tokens[i, :remainder[i]] += 1 |
|
|
|
return num_transfer_tokens |
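
# Example: with 10 masked tokens and steps = 4, base = 2 and remainder = 2,
# yielding the per-step transfer schedule [3, 3, 2, 2].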
|
|
|
def forward_process(batch, prompt_index, mask_id): |
|
b, l = batch.shape |
|
|
|
target_len = (l - prompt_index.sum()).item() |
|
k = torch.randint(1, target_len + 1, (), device=batch.device) |
|
|
|
x = torch.round(torch.linspace(float(k), k + (b - 1) * (target_len / b), steps=b, device=batch.device)).long() |
|
x = ((x - 1) % target_len) + 1 |
|
assert x.min() >= 1 and x.max() <= target_len |
|
|
|
indices = torch.arange(target_len, device=batch.device).repeat(b, 1) |
|
is_mask = indices < x.unsqueeze(1) |
|
for i in range(b): |
|
is_mask[i] = is_mask[i][torch.randperm(target_len)] |
|
|
|
is_mask = torch.cat((torch.zeros(b, prompt_index.sum(), dtype=torch.bool, device=batch.device), is_mask), dim=1) |
|
noisy_batch = torch.where(is_mask, mask_id, batch) |
|
|
|
# Return the masked batch and the mask ratio |
|
return noisy_batch, (x / target_len).unsqueeze(1).repeat(1, l) |
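
# The mask counts `x` are spread evenly across the batch (stratified over the
# possible mask ratios), so each row is corrupted at a different noise level;
# this lowers the variance of the Monte Carlo estimate in get_log_likelihood.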
|
|
|
|
|
def get_logits(model, batch, prompt_index, cfg_scale, mask_id): |
|
if cfg_scale > 0.: |
|
assert len(prompt_index) == batch.shape[1] |
|
prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1) |
|
un_batch = batch.clone() |
|
un_batch[prompt_index] = mask_id |
|
batch = torch.cat([batch, un_batch]) |
|
|
|
    logits = model(batch).logits
|
|
|
if cfg_scale > 0.: |
|
logits, un_logits = torch.chunk(logits, 2, dim=0) |
|
logits = un_logits + (cfg_scale + 1) * (logits - un_logits) |
|
return logits |
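
# The combination un_logits + (cfg_scale + 1) * (logits - un_logits) is
# classifier-free guidance: cfg_scale = 0 recovers the conditional logits,
# and larger values push predictions away from the unconditional branch.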
|
|
|
|
|
|
|
@torch.no_grad()
|
def get_log_likelihood(model, prompt, answer, mc_num=128, batch_size=32, cfg_scale=0., mask_id=126336): |
|
''' |
|
Args: |
|
model: Mask predictor. |
|
prompt: A tensor of shape (l1). |
|
answer: A tensor of shape (l2). |
|
mc_num: Monte Carlo estimation times. |
|
As detailed in Appendix B.5. Since MMLU, CMMLU, and C-EVAL only require the likelihood of a single token, a |
|
single Monte Carlo estimate is sufficient for these benchmarks. For all other benchmarks, we find that 128 |
|
Monte Carlo samples are adequate to produce stable results. |
|
batch_size: Mini batch size. |
|
cfg_scale: Unsupervised classifier-free guidance scale. |
|
        mask_id: The token id of [MASK] is 126336.
|
''' |
|
|
|
    seq = torch.cat([prompt, answer])[None, :]
|
seq = seq.repeat((batch_size, 1)).to(model.device) |
|
prompt_index = torch.arange(seq.shape[1], device=model.device) < len(prompt) |
|
|
|
loss_ = [] |
|
for _ in range(mc_num // batch_size): |
|
|
|
perturbed_seq, p_mask = forward_process(seq, prompt_index, mask_id) |
|
mask_index = perturbed_seq == mask_id |
|
|
|
logits = get_logits(model, perturbed_seq, prompt_index, cfg_scale, mask_id) |
|
|
|
loss = F.cross_entropy(logits[mask_index], seq[mask_index], reduction='none') / p_mask[mask_index] |
|
loss = loss.sum() / batch_size |
|
|
|
loss_.append(loss.item()) |
|
|
|
    return - sum(loss_) / len(loss_)
|
@torch.no_grad()
|
def generate(model, prompt, steps=128, gen_length=128, block_length=128, temperature=0., |
|
cfg_scale=0., remasking='low_confidence', mask_id=126336): |
|
''' |
|
Args: |
|
model: Mask predictor. |
|
prompt: A tensor of shape (1, l). |
|
steps: Sampling steps, less than or equal to gen_length. |
|
gen_length: Generated answer length. |
|
        block_length: Block length, less than or equal to gen_length. If less than gen_length, semi-autoregressive remasking is used.
|
temperature: Categorical distribution sampling temperature. |
|
cfg_scale: Unsupervised classifier-free guidance scale. |
|
remasking: Remasking strategy. 'low_confidence' or 'random'. |
|
        mask_id: The token id of [MASK] is 126336.
|
''' |
|
x = torch.full((1, prompt.shape[1] + gen_length), mask_id, dtype=torch.long).to(model.device) |
|
x[:, :prompt.shape[1]] = prompt.clone() |
|
|
|
prompt_index = (x != mask_id) |
|
|
|
assert gen_length % block_length == 0 |
|
num_blocks = gen_length // block_length |
|
|
|
assert steps % num_blocks == 0 |
|
steps = steps // num_blocks |
|
|
|
for num_block in range(num_blocks): |
|
        block_mask_index = (x[:, prompt.shape[1] + num_block * block_length: prompt.shape[1] + (num_block + 1) * block_length] == mask_id)
|
num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps) |
|
for i in range(steps): |
|
|
|
mask_index = (x == mask_id) |
|
if cfg_scale > 0.: |
|
un_x = x.clone() |
|
un_x[prompt_index] = mask_id |
|
x_ = torch.cat([x, un_x], dim=0) |
|
logits = model(x_).logits |
|
logits, un_logits = torch.chunk(logits, 2, dim=0) |
|
logits = un_logits + (cfg_scale + 1) * (logits - un_logits) |
|
else: |
|
logits = model(x).logits |
|
|
|
logits_with_noise = add_gumbel_noise(logits, temperature=temperature) |
|
x0 = torch.argmax(logits_with_noise, dim=-1) # b, l |
|
|
|
if remasking == 'low_confidence': |
|
p = F.softmax(logits.to(torch.float64), dim=-1) |
|
x0_p = torch.squeeze( |
|
torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l |
|
elif remasking == 'random': |
|
x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device) |
|
else: |
|
raise NotImplementedError(remasking) |
|
|
|
x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf |
|
|
|
x0 = torch.where(mask_index, x0, x) |
|
confidence = torch.where(mask_index, x0_p, -np.inf) |
|
|
|
transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device) |
|
for j in range(confidence.shape[0]): |
|
_, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i]) |
|
transfer_index[j, select_index] = True |
|
x[transfer_index] = x0[transfer_index] |
|
|
|
return x |
|
|
|
def main(): |
|
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=False) |
|
|
|
    traindataset = get_wikitext2(tokenizer, nsamples=128, seqlen=1024)  # calibration data (used when quantizing the base model)
|
|
|
    # The quantization config used to produce this checkpoint, shown for
    # reference; this script loads the already-quantized model below.
    quantize_config = QuantizeConfig(
        dynamic=dynamic,
        bits=4,  # quantize model to 4-bit
        group_size=128,  # 128 is the recommended value
        desc_act=True,
        sym=False,
    )
|
device = "cuda:0" if torch.cuda.is_available() else "cpu" |
|
prompt = "Question: Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours? The answer: " |
|
|
|
# # Add special tokens for the Instruct model. The Base model does not require the following two lines. |
|
# m = [{"role": "user", "content": prompt}, ] |
|
# prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) |
|
|
|
input_ids = tokenizer(prompt)['input_ids'] |
|
input_ids = torch.tensor(input_ids).to(device).unsqueeze(0) |
|
|
|
|
|
|
|
device = "cuda:0" if torch.cuda.is_available() else "cpu" |
|
    model = GPTQModel.load(quantized_model_id, device=device, trust_remote_code=True)
|
|
|
    steps = 128

    out = generate(model, input_ids, steps=steps, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')

    print("*" * 30 + f"GPTQ-4bit Steps {steps}" + "*" * 30)

    print(input_ids.shape)

    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
|
del model |
|
|
|
    model = AutoModel.from_pretrained(pretrained_model_id, trust_remote_code=True).to(device)
|
|
|
    out = generate(model, input_ids, steps=steps, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')

    print("*" * 30 + f"FP16 Steps {steps}" + "*" * 30)

    print(input_ids.shape)

    print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
|
|
|
|
|
if __name__ == "__main__": |
|
import logging |
|
|
|
logging.basicConfig( |
|
format="%(asctime)s %(levelname)s [%(name)s] %(message)s", |
|
level=logging.INFO, |
|
datefmt="%Y-%m-%d %H:%M:%S", |
|
) |
|
|
|
    main()
|
``` |