Upload files with `vila-upload`.
Upload configuration_vila.py
Upload config.json
Upload auto_processor.py
Upload modeling_vila.py

Files changed:

- auto_processor.py: +10 -8
- config.json: +3 -3
- configuration_vila.py: +2 -2
- modeling_vila.py: +13 -11

auto_processor.py (CHANGED)

```diff
@@ -153,7 +153,9 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
 
-    def __init__(
+    def __init__(
+        self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs
+    ):
         self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
@@ -161,11 +163,10 @@ class VILAProcessor(ProcessorMixin):
         self.tokenizer = tokenizer
         self.padding_side = padding_side
 
-        # This is a special setting for Qwen.
+        # This is a special setting for Qwen.
         # self.pad_token_id = tokenizer.pad_token_id
-        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]  # 151643
         self.eos_token_id = self.tokenizer.eos_token_id
-
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
     @staticmethod
@@ -243,6 +244,7 @@ class VILAProcessor(ProcessorMixin):
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
+
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
 
         image_processor = AutoImageProcessor.from_pretrained(
@@ -293,7 +295,7 @@ class VILAProcessor(ProcessorMixin):
                 media[name] += feat.media[name]
             for name in feat.media_config:
                 media_config[name].update(feat.media_config[name])
-
+
         # pad the input_ids to batchfy
         input_ids = pad_fn(
             input_ids_list,
@@ -354,18 +356,18 @@ class VILAProcessor(ProcessorMixin):
                     images = images.half()
                     media_config[name]["block_sizes"] = [block_sizes]
                 else:
-                    images = process_images(media["image"], self.
+                    images = process_images(media["image"], self.image_processor, self.config).half()
                     media[name] = [image for image in images]
             elif name == "video":
                 media[name] = [
-                    process_images(images, self.
-                    for images in media[name]
+                    process_images(images, self.image_processor, self.config).half() for images in media[name]
                 ]
             else:
                 raise ValueError(f"Unsupported media type: {name}")
 
         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
         input_ids = inputs.input_ids[0].unsqueeze(0).cuda()
+
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
```
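
The `from_pretrained` hunk above relies on a common Hub pattern: if the given identifier is not a local directory, fetch the whole repo with `huggingface_hub.snapshot_download` and continue from the returned local path. A minimal sketch of that pattern; the helper name is illustrative and not part of this repo:

```python
# Illustrative sketch of the fallback touched in VILAProcessor.from_pretrained:
# resolve a Hub repo id to a local directory before reading its files.
import os
from huggingface_hub import snapshot_download

def resolve_local_path(pretrained_model_name_or_path: str) -> str:
    # Hypothetical helper, not defined in auto_processor.py.
    if os.path.isdir(pretrained_model_name_or_path):
        return pretrained_model_name_or_path
    # Downloads the repo (or reuses the local cache) and returns its on-disk path.
    return snapshot_download(pretrained_model_name_or_path)
```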

config.json (CHANGED)

```diff
@@ -2,12 +2,12 @@
   "_attn_implementation_autoset": true,
   "_name_or_path": "NVILA-Lite-8B-hf-preview",
   "architectures": [
-    "
+    "VILAForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "configuration_vila.VILAConfig",
-    "AutoModel": "modeling_vila.
-    "AutoModelForCausalLM": "modeling_vila.
+    "AutoModel": "modeling_vila.VILAForCausalLM",
+    "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
     "AutoProcessor": "auto_processor.VILAProcessor"
   },
   "chat_template": null,
```
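
The `auto_map` entries above are what let the stock `Auto*` classes resolve the custom code shipped in this repo. A minimal loading sketch; the repo id is a placeholder taken from `_name_or_path`, not necessarily the actual Hub path, and `trust_remote_code=True` is required so `configuration_vila.py`, `modeling_vila.py`, and `auto_processor.py` are imported from the repo:

```python
# Hypothetical usage; "NVILA-Lite-8B-hf-preview" is a placeholder repo id.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor

repo = "NVILA-Lite-8B-hf-preview"

config = AutoConfig.from_pretrained(repo, trust_remote_code=True)        # -> configuration_vila.VILAConfig
model = AutoModelForCausalLM.from_pretrained(                            # -> modeling_vila.VILAForCausalLM
    repo, trust_remote_code=True, torch_dtype=torch.float16
)
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)  # -> auto_processor.VILAProcessor
```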

configuration_vila.py (CHANGED)

```diff
@@ -57,7 +57,8 @@ class VILAConfig(PretrainedConfig):
         video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
         **kwargs,
     ):
-        super().__init__()
+        super().__init__(**kwargs)
+
         self.architectures = architectures
         self.llm_cfg = llm_cfg
         self.vision_tower_cfg = vision_tower_cfg
@@ -90,4 +91,3 @@ class VILAConfig(PretrainedConfig):
         self.image_encoder = image_encoder
         self.video_encoder = video_encoder
 
-        super().__init__(**kwargs)
```
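
The change above collapses two base-constructor calls into one: the old `__init__` called `super().__init__()` with no arguments at the top and `super().__init__(**kwargs)` again at the bottom, while the new version calls it once, first, with `**kwargs`. A stripped-down sketch of the resulting pattern; the class name and fields are illustrative, not the real `VILAConfig`:

```python
# Minimal sketch of a PretrainedConfig subclass following the pattern adopted
# above: forward **kwargs to the base constructor once, before setting
# model-specific attributes. Field names here are illustrative only.
from transformers import PretrainedConfig

class TinyVILALikeConfig(PretrainedConfig):
    model_type = "vila"

    def __init__(self, llm_cfg=None, vision_tower_cfg=None, **kwargs):
        # Let PretrainedConfig consume any standard transformers kwargs first.
        super().__init__(**kwargs)
        self.llm_cfg = llm_cfg
        self.vision_tower_cfg = vision_tower_cfg
```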

modeling_vila.py (CHANGED)

```diff
@@ -59,6 +59,7 @@ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and
 # ease debugging
 python_input = input
 
+
 # quick hack for remote code
 def get_pg_manager():
     return None
@@ -191,6 +192,7 @@ class VILAPretrainedModel(PreTrainedModel):
     main_input_name = "input_embeds"
     supports_gradient_checkpointing = True
     _supports_flash_attn_2 = True
+    _no_split_modules = ["Qwen2DecoderLayer", "SiglipEncoderLayer"]
 
     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config)
@@ -268,12 +270,12 @@ class VILAPretrainedModel(PreTrainedModel):
         cfg_path = os.path.join(output_dir, "config.json")
         config = json.load(open(cfg_path))
         config["version"] = "2.0"  # nvila tag
-        config["architectures"] = ["
+        config["architectures"] = ["VILAForCausalLM"]
         config["auto_map"] = {
             "AutoProcessor": "auto_processor.VILAProcessor",
             "AutoConfig": "modeling_vila.VILAConfig",
-            "AutoModel": "modeling_vila.
-            "AutoModelForCausalLM": "modeling_vila.
+            "AutoModel": "modeling_vila.VILAForCausalLM",
+            "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
         }
         # vila1.5 legacy support
         config["model_type"] = "vila"
@@ -501,7 +503,7 @@ class VILAPretrainedModel(PreTrainedModel):
         self.get_mm_projector().eval()
 
 
-class 
+class VILAForCausalLM(VILAPretrainedModel):
     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config, *args, **kwargs)
 
@@ -1082,7 +1084,8 @@ class VILAForCasualLM(VILAPretrainedModel):
 
         return outputs
 
-
+    # TODO(ligeng): check how qwen implements this function
+    # @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
@@ -1100,14 +1103,13 @@ class VILAForCasualLM(VILAPretrainedModel):
         input_emds: <media emd> 001 002 003 004
         """
         # NOTE: hard code to move to GPU
-        input_ids = input_ids.cuda()
-        media = {k: [v.cuda() for v in media[k]] for k in media}
-        if attention_mask is not None:
-            attention_mask = attention_mask.cuda()
-
+        # input_ids = input_ids.cuda()
+        # media = {k: [v.cuda() if v is not None for v in media[k]] for k in media}
+        # if attention_mask is not None:
+        #     attention_mask = attention_mask.cuda()
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
+
         if return_output_ids_only:
             return_value = output_ids
         else:
```
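
One practical effect of the `_no_split_modules` addition: with `Qwen2DecoderLayer` and `SiglipEncoderLayer` declared unsplittable, `device_map="auto"` can shard the model across devices without cutting through those blocks. A usage sketch under that assumption; the repo id is a placeholder, not taken from this commit:

```python
# Hypothetical multi-GPU load enabled by _no_split_modules.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "NVILA-Lite-8B-hf-preview",  # placeholder Hub repo id
    trust_remote_code=True,      # imports modeling_vila.VILAForCausalLM via auto_map
    torch_dtype=torch.float16,
    device_map="auto",           # accelerate places whole layers, respecting _no_split_modules
)
```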