Upload quantize.py with huggingface_hub
Browse files- quantize.py +111 -26
quantize.py
CHANGED
@@ -1,33 +1,118 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
from
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
7 |
|
8 |
# Load model.
|
9 |
-
|
10 |
-
model =
|
11 |
-
|
12 |
-
|
13 |
-
# Configure the quantization algorithm and scheme.
|
14 |
-
# In this case, we:
|
15 |
-
# * quantize the weights to fp8 with per channel via ptq
|
16 |
-
# * quantize the activations to fp8 with dynamic per token
|
17 |
-
recipe = QuantizationModifier(
|
18 |
-
targets="Linear",
|
19 |
-
scheme="FP8_DYNAMIC",
|
20 |
-
ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"],
|
21 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# Confirm generations of the quantized model look sane.
|
29 |
print("========== SAMPLE GENERATION ==============")
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import requests
|
3 |
+
import torch
|
4 |
+
from PIL import Image
|
5 |
+
from transformers import AutoProcessor
|
6 |
+
from datasets import load_dataset
|
7 |
+
from llmcompressor import oneshot
|
8 |
+
from llmcompressor.modifiers.quantization import GPTQModifier
|
9 |
+
from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration
|
10 |
|
11 |
# Load the traceable LLaVA model and the processor that belongs to it.
model_id = "llama-joycaption-beta-one-hf-llava"
model = TraceableLlavaForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="bfloat16",
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Calibration settings that are later handed to oneshot().
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Instruction used as the user turn of every calibration conversation.
PROMPT = "Write a long descriptive caption for this image in a formal tone."

# Take the first NUM_CALIBRATION_SAMPLES rows of the split, then shuffle
# them with a fixed seed so the ordering is reproducible.
ds = load_dataset(
    DATASET_ID,
    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
).shuffle(seed=42)
|
28 |
+
|
29 |
+
|
30 |
+
def preprocess_function(example):
    """Turn one dataset row into processor outputs used for calibration.

    Builds a system/user/assistant conversation whose assistant turn is the
    row's captions joined by single spaces, renders it with the processor's
    chat template, and runs the processor on the rendered text together with
    the row's image.

    Args:
        example: A dataset row. The code reads ``example["caption"]`` (an
            iterable of strings, since it is passed to ``" ".join``) and
            ``example["image"]``.

    Returns:
        The processor output moved to ``"cuda"``, with ``pixel_values`` cast
        to ``torch.bfloat16``.
    """
    # Build the conversation
    reference_caption = " ".join(example["caption"])
    convo = [
        {
            "role": "system",
            "content": "You are a helpful image captioner.",
        },
        {
            "role": "user",
            "content": PROMPT,
        },
        {"role": "assistant", "content": reference_caption},
    ]

    # Format the conversation
    # WARNING: HF's handling of chats on Llava models is very fragile. This
    # specific combination of processor.apply_chat_template() and processor()
    # works, but if using other combinations always inspect the final
    # input_ids to ensure they are correct. Oftentimes you will end up with
    # multiple <bos> tokens if not careful, which can make the model perform
    # poorly.
    #
    # The conversation already ends with a completed assistant turn, so no
    # generation prompt is requested: asking for one would append a second,
    # empty assistant header after the caption in every calibration sample.
    convo_string = processor.apply_chat_template(
        convo, tokenize=False, add_generation_prompt=False
    )
    assert isinstance(convo_string, str)  # narrows the type; tokenize=False

    # Process the inputs
    inputs = processor(
        text=[convo_string], images=[example["image"]], return_tensors="pt"
    ).to("cuda")
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
    return inputs
|
59 |
|
60 |
+
|
61 |
+
# Run preprocess_function over every calibration sample.
ds = ds.map(preprocess_function)
|
62 |
+
|
63 |
+
|
64 |
+
# Define a oneshot data collator for multimodal inputs.
|
65 |
+
def data_collator(batch):
    """Convert a single-sample batch into a dict of tensors.

    Args:
        batch: A sequence holding exactly one sample. The sample is a mapping
            with ``"input_ids"``, ``"attention_mask"`` and ``"pixel_values"``
            entries; any other keys are ignored.

    Returns:
        A dict with those three keys, each converted to a ``torch.Tensor``.
        ``input_ids`` is always ``torch.long``; the other two take the dtype
        that ``torch.tensor`` infers from the data.

    Raises:
        ValueError: If ``batch`` does not contain exactly one sample.
    """
    # Raise explicitly rather than assert: asserts are stripped under
    # ``python -O`` and the check would silently disappear.
    if len(batch) != 1:
        raise ValueError(
            f"data_collator expects a batch of exactly 1 sample, got {len(batch)}"
        )

    sample = batch[0]
    return {
        "input_ids": torch.tensor(sample["input_ids"], dtype=torch.long),
        "attention_mask": torch.tensor(sample["attention_mask"]),
        "pixel_values": torch.tensor(sample["pixel_values"]),
    }
|
73 |
+
|
74 |
+
|
75 |
+
# Where oneshot() writes its output.
SAVE_DIR = f"{model_id}-W8A8"

# Quantization recipe: a single GPTQ pass with the W8A8 scheme over Linear
# modules. The ignore patterns exclude the LM head, the vision tower and the
# multimodal projector.
gptq_modifier = GPTQModifier(
    targets="Linear",
    scheme="W8A8",
    sequential_targets=["LlamaDecoderLayer"],
    ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
)
recipe = [gptq_modifier]

# Run one-shot calibration and quantization on the preprocessed dataset.
calibration_split = f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
oneshot(
    model=model,
    tokenizer=model_id,
    dataset=ds,
    splits={"calibration": calibration_split},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
    output_dir=SAVE_DIR,
)
|
99 |
|
100 |
# Confirm generations of the quantized model look sane by captioning one
# sample image and printing the decoded output.
print("========== SAMPLE GENERATION ==============")
user_content = [
    {"type": "text", "text": "Please describe the animal in this image\n"},
    {"type": "image"},
]
messages = [{"role": "user", "content": user_content}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of handing an error
# page to PIL.
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw)

inputs = processor(images=[raw_image], text=prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
print("==========================================")
|