Ligeng-Zhu committed (verified)
Commit e2481b0 · Parent: 30c1060

Upload files with `vila-upload`.


Upload configuration_vila.py
Upload config.json
Upload auto_processor.py
Upload modeling_vila.py

Files changed (4)
  1. auto_processor.py +10 -8
  2. config.json +3 -3
  3. configuration_vila.py +2 -2
  4. modeling_vila.py +13 -11
auto_processor.py CHANGED
@@ -153,7 +153,9 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
 
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs):
+    def __init__(
+        self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs
+    ):
         self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
@@ -161,11 +163,10 @@ class VILAProcessor(ProcessorMixin):
         self.tokenizer = tokenizer
         self.padding_side = padding_side
 
-        # This is a special setting for Qwen.
+        # This is a special setting for Qwen.
         # self.pad_token_id = tokenizer.pad_token_id
-        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0] # 151643
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0] # 151643
         self.eos_token_id = self.tokenizer.eos_token_id
-
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
     @staticmethod
@@ -243,6 +244,7 @@ class VILAProcessor(ProcessorMixin):
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
+
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
 
         image_processor = AutoImageProcessor.from_pretrained(
@@ -293,7 +295,7 @@ class VILAProcessor(ProcessorMixin):
             media[name] += feat.media[name]
         for name in feat.media_config:
             media_config[name].update(feat.media_config[name])
-
+
         # pad the input_ids to batchfy
         input_ids = pad_fn(
             input_ids_list,
@@ -354,18 +356,18 @@ class VILAProcessor(ProcessorMixin):
                     images = images.half()
                     media_config[name]["block_sizes"] = [block_sizes]
                 else:
-                    images = process_images(media["image"], self.vision_tower.image_processor, self.config).half()
+                    images = process_images(media["image"], self.image_processor, self.config).half()
                 media[name] = [image for image in images]
             elif name == "video":
                 media[name] = [
-                    process_images(images, self.vision_tower.image_processor, self.config).half()
-                    for images in media[name]
+                    process_images(images, self.image_processor, self.config).half() for images in media[name]
                 ]
             else:
                 raise ValueError(f"Unsupported media type: {name}")
 
         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
         input_ids = inputs.input_ids[0].unsqueeze(0).cuda()
+
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
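The processor changes above are formatting cleanups plus two behavioral fixes: `pad_token_id` is derived by tokenizing `<|endoftext|>` (151643 for the Qwen tokenizer), and `process_images` now uses the processor's own `self.image_processor` instead of `self.vision_tower.image_processor`. A minimal usage sketch follows; the repo id is an assumption and not part of this commit:

```python
# Hypothetical sketch: load the remote-code processor and inspect the padding setup.
from transformers import AutoProcessor

repo = "Efficient-Large-Model/NVILA-Lite-8B-hf-preview"  # assumed repo id

processor = AutoProcessor.from_pretrained(
    repo,
    trust_remote_code=True,  # resolves auto_processor.VILAProcessor via auto_map
)

# pad_token_id is taken from tokenizing "<|endoftext|>" (151643 for Qwen tokenizers),
# so left-padded batches built by the processor decode cleanly.
print(processor.pad_token_id, processor.eos_token_id)
```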
config.json CHANGED
@@ -2,12 +2,12 @@
     "_attn_implementation_autoset": true,
     "_name_or_path": "NVILA-Lite-8B-hf-preview",
     "architectures": [
-        "VILAForCasualLM"
+        "VILAForCausalLM"
     ],
     "auto_map": {
         "AutoConfig": "configuration_vila.VILAConfig",
-        "AutoModel": "modeling_vila.VILAForCasualLM",
-        "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
+        "AutoModel": "modeling_vila.VILAForCausalLM",
+        "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
         "AutoProcessor": "auto_processor.VILAProcessor"
     },
     "chat_template": null,
configuration_vila.py CHANGED
@@ -57,7 +57,8 @@ class VILAConfig(PretrainedConfig):
         video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
         **kwargs,
     ):
-        super().__init__()
+        super().__init__(**kwargs)
+
         self.architectures = architectures
         self.llm_cfg = llm_cfg
         self.vision_tower_cfg = vision_tower_cfg
@@ -90,4 +91,3 @@ class VILAConfig(PretrainedConfig):
         self.image_encoder = image_encoder
         self.video_encoder = video_encoder
 
-        super().__init__(**kwargs)
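Calling `super().__init__(**kwargs)` once, at the top of the constructor, follows the usual `PretrainedConfig` pattern: the base class consumes the generic kwargs first, so the explicit attribute assignments that follow cannot be overwritten by a late base-class call (previously `super().__init__()` ran without kwargs at the top and `super().__init__(**kwargs)` ran again at the end). A minimal sketch of the pattern with an illustrative config class, not the real `VILAConfig`:

```python
# Illustrative PretrainedConfig subclass showing the init ordering this commit adopts.
from transformers import PretrainedConfig


class MyVILALikeConfig(PretrainedConfig):  # hypothetical name, for illustration only
    model_type = "my_vila_like"

    def __init__(self, llm_cfg=None, vision_tower_cfg=None, **kwargs):
        # Base init first: PretrainedConfig consumes generic kwargs
        # (e.g. tie_word_embeddings), so the explicit assignments below win.
        super().__init__(**kwargs)

        self.llm_cfg = llm_cfg
        self.vision_tower_cfg = vision_tower_cfg


cfg = MyVILALikeConfig(llm_cfg={"llm": "qwen2"}, tie_word_embeddings=False)
print(cfg.llm_cfg, cfg.tie_word_embeddings)
```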
modeling_vila.py CHANGED
@@ -59,6 +59,7 @@ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and
 # ease debugging
 python_input = input
 
+
 # quick hack for remote code
 def get_pg_manager():
     return None
@@ -191,6 +192,7 @@ class VILAPretrainedModel(PreTrainedModel):
     main_input_name = "input_embeds"
     supports_gradient_checkpointing = True
     _supports_flash_attn_2 = True
+    _no_split_modules = ["Qwen2DecoderLayer", "SiglipEncoderLayer"]
 
     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config)
@@ -268,12 +270,12 @@ class VILAPretrainedModel(PreTrainedModel):
         cfg_path = os.path.join(output_dir, "config.json")
         config = json.load(open(cfg_path))
         config["version"] = "2.0" # nvila tag
-        config["architectures"] = ["VILAForCasualLM"]
+        config["architectures"] = ["VILAForCausalLM"]
         config["auto_map"] = {
             "AutoProcessor": "auto_processor.VILAProcessor",
             "AutoConfig": "modeling_vila.VILAConfig",
-            "AutoModel": "modeling_vila.VILAForCasualLM",
-            "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
+            "AutoModel": "modeling_vila.VILAForCausalLM",
+            "AutoModelForCausalLM": "modeling_vila.VILAForCausalLM",
         }
         # vila1.5 legacy support
         config["model_type"] = "vila"
@@ -501,7 +503,7 @@ class VILAPretrainedModel(PreTrainedModel):
         self.get_mm_projector().eval()
 
 
-class VILAForCasualLM(VILAPretrainedModel):
+class VILAForCausalLM(VILAPretrainedModel):
     def __init__(self, config: VILAConfig, *args, **kwargs):
         super().__init__(config, *args, **kwargs)
 
@@ -1082,7 +1084,8 @@ class VILAForCasualLM(VILAPretrainedModel):
 
         return outputs
 
-    @torch.inference_mode()
+    # TODO(ligeng): check how qwen implements this function
+    # @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
@@ -1100,14 +1103,13 @@ class VILAForCasualLM(VILAPretrainedModel):
         input_emds: <media emd> 001 002 003 004
         """
         # NOTE: hard code to move to GPU
-        input_ids = input_ids.cuda()
-        media = {k: [v.cuda() for v in media[k]] for k in media}
-        if attention_mask is not None:
-            attention_mask = attention_mask.cuda()
-
+        # input_ids = input_ids.cuda()
+        # media = {k: [v.cuda() if v is not None for v in media[k]] for k in media}
+        # if attention_mask is not None:
+        #     attention_mask = attention_mask.cuda()
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
-
+
         if return_output_ids_only:
             return_value = output_ids
         else:
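The modeling changes rename `VILAForCasualLM` to `VILAForCausalLM` throughout, add `_no_split_modules` so `accelerate` never splits a `Qwen2DecoderLayer` or `SiglipEncoderLayer` across devices, and comment out the hard-coded `.cuda()` moves in `generate` in favor of whatever placement the device map chose. A loading sketch under those assumptions (the repo id is assumed):

```python
# Hypothetical multi-GPU loading sketch enabled by _no_split_modules.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "Efficient-Large-Model/NVILA-Lite-8B-hf-preview",  # assumed repo id
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",  # shards by whole Qwen2DecoderLayer / SiglipEncoderLayer blocks
)

# With the explicit .cuda() calls removed, generate() works with inputs on
# whichever device(s) the dispatch hooks expect, instead of forcing cuda:0.
```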