Upload quantize.py with huggingface_hub
Browse files- quantize.py +26 -111
quantize.py
CHANGED
@@ -1,118 +1,33 @@
|
|
1 |
-
import
|
2 |
-
import requests
|
3 |
-
import torch
|
4 |
-
from PIL import Image
|
5 |
-
from transformers import AutoProcessor
|
6 |
-
from datasets import load_dataset
|
7 |
-
from llmcompressor import oneshot
|
8 |
-
from llmcompressor.modifiers.quantization import GPTQModifier
|
9 |
-
from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
model = TraceableLlavaForConditionalGeneration.from_pretrained(
|
14 |
-
model_id, device_map="auto", torch_dtype="bfloat16"
|
15 |
-
)
|
16 |
-
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
17 |
-
|
18 |
-
# Oneshot arguments
|
19 |
-
DATASET_ID = "lmms-lab/flickr30k"
|
20 |
-
DATASET_SPLIT = "test"
|
21 |
-
NUM_CALIBRATION_SAMPLES = 512
|
22 |
-
MAX_SEQUENCE_LENGTH = 2048
|
23 |
-
|
24 |
-
PROMPT = "Write a long descriptive caption for this image in a formal tone."
|
25 |
-
|
26 |
-
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
|
27 |
-
ds = ds.shuffle(seed=42)
|
28 |
-
|
29 |
-
|
30 |
-
def preprocess_function(example):
|
31 |
-
# Build the conversation
|
32 |
-
convo = [
|
33 |
-
{
|
34 |
-
"role": "system",
|
35 |
-
"content": "You are a helpful image captioner.",
|
36 |
-
},
|
37 |
-
{
|
38 |
-
"role": "user",
|
39 |
-
"content": PROMPT,
|
40 |
-
},
|
41 |
-
{"role": "assistant", "content": " ".join(example["caption"])},
|
42 |
-
]
|
43 |
-
|
44 |
-
# Format the conversation
|
45 |
-
# WARNING: HF's handling of chats on Llava models is very fragile. This specific combination of processor.apply_chat_template() and processor() works
|
46 |
-
# but if using other combinations, always inspect the final input_ids to ensure they are correct. Oftentimes you will end up with multiple <bos> tokens
|
47 |
-
# if not careful, which can make the model perform poorly.
|
48 |
-
convo_string = processor.apply_chat_template(
|
49 |
-
convo, tokenize=False, add_generation_prompt=True
|
50 |
-
)
|
51 |
-
assert isinstance(convo_string, str)
|
52 |
-
|
53 |
-
# Process the inputs
|
54 |
-
inputs = processor(
|
55 |
-
text=[convo_string], images=[example["image"]], return_tensors="pt"
|
56 |
-
).to("cuda")
|
57 |
-
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
58 |
-
return inputs
|
59 |
|
|
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
# Recipe
|
76 |
-
recipe = [
|
77 |
-
GPTQModifier(
|
78 |
-
targets="Linear",
|
79 |
-
scheme="W8A8",
|
80 |
-
sequential_targets=["LlamaDecoderLayer"],
|
81 |
-
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
|
82 |
-
),
|
83 |
-
]
|
84 |
-
SAVE_DIR = model_id + "-W8A8"
|
85 |
-
|
86 |
-
# Perform oneshot
|
87 |
-
oneshot(
|
88 |
-
model=model,
|
89 |
-
tokenizer=model_id,
|
90 |
-
dataset=ds,
|
91 |
-
splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
|
92 |
-
recipe=recipe,
|
93 |
-
max_seq_length=MAX_SEQUENCE_LENGTH,
|
94 |
-
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
95 |
-
trust_remote_code_model=True,
|
96 |
-
data_collator=data_collator,
|
97 |
-
output_dir=SAVE_DIR,
|
98 |
)
|
99 |
|
|
|
|
|
|
|
|
|
|
|
100 |
# Confirm generations of the quantized model look sane.
|
101 |
print("========== SAMPLE GENERATION ==============")
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
{"type": "text", "text": "Please describe the animal in this image\n"},
|
107 |
-
{"type": "image"},
|
108 |
-
],
|
109 |
-
},
|
110 |
-
]
|
111 |
-
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
|
112 |
-
image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
|
113 |
-
raw_image = Image.open(requests.get(image_url, stream=True).raw)
|
114 |
-
|
115 |
-
inputs = processor(images=[raw_image], text=prompt, return_tensors="pt").to("cuda")
|
116 |
-
output = model.generate(**inputs, max_new_tokens=100)
|
117 |
-
print(processor.decode(output[0], skip_special_tokens=True))
|
118 |
-
print("==========================================")
|
|
|
1 |
+
from transformers import AutoProcessor, LlavaForConditionalGeneration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
from llmcompressor.modifiers.quantization import QuantizationModifier
|
4 |
+
from llmcompressor.transformers import oneshot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
+
MODEL_ID = "llama-joycaption-beta-one-hf-llava"
|
7 |
|
8 |
+
# Load model.
|
9 |
+
model_class = LlavaForConditionalGeneration
|
10 |
+
model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
|
11 |
+
processor = AutoProcessor.from_pretrained(MODEL_ID)
|
12 |
+
|
13 |
+
# Configure the quantization algorithm and scheme.
|
14 |
+
# In this case, we:
|
15 |
+
# * quantize the weights to fp8 per channel via PTQ (post-training quantization)
|
16 |
+
# * quantize the activations to fp8 dynamically, per token
|
17 |
+
recipe = QuantizationModifier(
|
18 |
+
targets="Linear",
|
19 |
+
scheme="FP8_DYNAMIC",
|
20 |
+
ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
)
|
22 |
|
23 |
+
# Apply quantization and save to disk in compressed-tensors format.
|
24 |
+
SAVE_DIR = MODEL_ID + "-FP8-Dynamic"
|
25 |
+
oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
|
26 |
+
processor.save_pretrained(SAVE_DIR)
|
27 |
+
|
28 |
# Confirm generations of the quantized model look sane.
|
29 |
print("========== SAMPLE GENERATION ==============")
|
30 |
+
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
|
31 |
+
output = model.generate(input_ids, max_new_tokens=20)
|
32 |
+
print(processor.decode(output[0]))
|
33 |
+
print("==========================================")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|