delinqu committed (verified)
Commit 478fea7 · 1 Parent(s): 5ace675

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
action_tokenizer.py ADDED
@@ -0,0 +1,431 @@
"""
action_tokenizer.py

Extension class; wraps a base LLM/VLM tokenizer with logic to discretize and tokenize continuous robot actions.
"""
from typing import List, Union, Dict, Optional
import numpy as np
from transformers import PreTrainedTokenizerBase
from scipy.stats import norm
import torch

ACTION_TOKEN = '<ACTION{:05d}>'

class ActionTokenizer:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: int = 256,
        min_action: int = -1,
        max_action: int = 1,
    ):
        self._vocab_size = num_bins
        self.tokenizer = tokenizer
        self.min_action, self.max_action = min_action, max_action
        self.bin_centers = np.linspace(min_action, max_action, num_bins)

        # add special action tokens to the language tokenizer
        token_list = [ACTION_TOKEN.format(i) for i in range(self._vocab_size)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Add {num_new_tokens} ACTION TOKENS, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.action_token_begin_idx = self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])

    def __call__(self, action: np.ndarray) -> List[str]:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 7), continuous actions.
        return: np.ndarray, (n, 7), tokens.
        """
        action = np.clip(action, a_min=float(self.min_action), a_max=float(self.max_action))
        ids = np.digitize(action, self.bin_centers, right=True)  # [0, 255]
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n, 7), token ids.
        return: np.ndarray, (n, 7), continuous actions
        """
        ids = action_token_id - self.action_token_begin_idx
        ids = np.clip(ids, a_min=0, a_max=self._vocab_size - 1)
        return self.bin_centers[ids]

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

class TranslationTokenizer:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: Dict,
        bin_policy: Optional[Dict] = None,
        use_spherical: bool = True,
    ):
        self.tokenizer = tokenizer
        self.num_theta_bins = num_bins["theta_bins"]
        self.num_phi_bins = num_bins["phi_bins"]
        self.num_r_bins = num_bins["r_bins"]
        self.use_spherical = use_spherical

        # for indexing
        self.NP = self.num_phi_bins * self.num_r_bins

        # add special action tokens to the language tokenizer
        self._vocab_size = self.num_theta_bins * self.num_phi_bins * self.num_r_bins
        token_list = [ACTION_TOKEN.format(i) for i in range(self._vocab_size)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Add {num_new_tokens} TRANSLATION TOKENS, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
        self.set_bins(bin_policy)

    def set_bins(self, bin_policy):
        self.theta_bins = np.array(bin_policy["theta_bins"])
        self.phi_bins = np.array(bin_policy["phi_bins"])
        self.r_bins = np.array(bin_policy["r_bins"])

    def cartesian_to_spherical(self, x, y, z):
        theta = np.arctan2(np.sqrt(x**2 + y**2), z)  # polar angle
        phi = np.arctan2(y, x)  # azimuthal angle
        r = np.sqrt(x**2 + y**2 + z**2)
        return theta, phi, r

    def spherical_to_cartesian(self, theta, phi, r):
        x = r * np.sin(theta) * np.cos(phi)
        y = r * np.sin(theta) * np.sin(phi)
        z = r * np.cos(theta)
        return x, y, z

    def __call__(self, action: np.ndarray) -> List[str]:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 3), continuous translations in Cartesian or spherical coordinates.
        return: np.ndarray, (n,), tokens.
        """
        if self.use_spherical:
            theta, phi, r = self.cartesian_to_spherical(action[:, 0], action[:, 1], action[:, 2])
        else:
            theta, phi, r = action[:, 0], action[:, 1], action[:, 2]

        disc_theta = np.digitize(theta, self.theta_bins[1:-1])
        disc_phi = np.digitize(phi, self.phi_bins[1:-1])
        disc_r = np.digitize(r, self.r_bins[1:-1])
        ids = disc_theta * self.NP + disc_phi * self.num_r_bins + disc_r
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n,), token ids.
        return: np.ndarray, (n, 3), continuous actions
        """
        action_token_id = np.clip(action_token_id, self.token_start_idx, self.token_end_idx)
        ids = action_token_id - self.token_start_idx
        disc_theta, disc_phi, disc_r = ids // self.NP, (ids % self.NP) // self.num_r_bins, ids % self.num_r_bins

        theta = 0.5 * (self.theta_bins[disc_theta] + self.theta_bins[disc_theta + 1])
        phi = 0.5 * (self.phi_bins[disc_phi] + self.phi_bins[disc_phi + 1])
        r = 0.5 * (self.r_bins[disc_r] + self.r_bins[disc_r + 1])

        # clip actions to [-1, 1]: the spherical action space is the circumscribed sphere of the Cartesian action space.
        x, y, z = self.spherical_to_cartesian(theta, phi, r) if self.use_spherical else (theta, phi, r)
        x, y, z = np.clip([x, y, z], -1, 1)
        return np.stack((x, y, z), axis=1)

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

class RotationTokenizer:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: Dict,
        bin_policy: Optional[Dict] = None,
        array_begin_idx=None,
    ):
        self.tokenizer = tokenizer
        self.num_roll_bins = num_bins["roll_bins"]  # M
        self.num_pitch_bins = num_bins["pitch_bins"]  # N
        self.num_yaw_bins = num_bins["yaw_bins"]  # P
        self.array_begin_idx = array_begin_idx

        # for indexing
        self.NP = self.num_pitch_bins * self.num_yaw_bins

        # add special action tokens to the language tokenizer
        self._vocab_size = self.num_roll_bins * self.num_pitch_bins * self.num_yaw_bins
        token_list = [ACTION_TOKEN.format(i + self.array_begin_idx) for i in range(self._vocab_size)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Add {num_new_tokens} ROTATION TOKENS to tokenizer, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])
        self.set_bins(bin_policy)

    def set_bins(self, bin_policy):
        self.roll_bins = np.array(bin_policy["roll_bins"])
        self.pitch_bins = np.array(bin_policy["pitch_bins"])
        self.yaw_bins = np.array(bin_policy["yaw_bins"])

    def __call__(self, action: np.ndarray) -> List[str]:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 3), continuous rotations (roll, pitch, yaw).
        return: np.ndarray, (n,), tokens.
        """
        roll, pitch, yaw = action[:, 0], action[:, 1], action[:, 2]
        disc_roll = np.clip(np.digitize(roll, self.roll_bins) - 1, 0, self.num_roll_bins - 1)
        disc_pitch = np.clip(np.digitize(pitch, self.pitch_bins) - 1, 0, self.num_pitch_bins - 1)
        disc_yaw = np.clip(np.digitize(yaw, self.yaw_bins) - 1, 0, self.num_yaw_bins - 1)

        ids = disc_roll * self.NP + disc_pitch * self.num_yaw_bins + disc_yaw
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: Union[np.int64, np.ndarray]) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n,), token ids.
        return: np.ndarray, (n, 3), continuous actions
        """
        action_token_id = np.clip(action_token_id, a_min=self.token_start_idx, a_max=self.token_end_idx)
        ids = action_token_id - self.token_start_idx
        disc_roll, disc_pitch, disc_yaw = ids // self.NP, (ids % self.NP) // self.num_yaw_bins, ids % self.num_yaw_bins

        roll = 0.5 * (self.roll_bins[disc_roll] + self.roll_bins[disc_roll + 1])
        pitch = 0.5 * (self.pitch_bins[disc_pitch] + self.pitch_bins[disc_pitch + 1])
        yaw = 0.5 * (self.yaw_bins[disc_yaw] + self.yaw_bins[disc_yaw + 1])
        return np.stack((roll, pitch, yaw), axis=1)

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

class GripperTokenzier:
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: int = 2,
        array_begin_idx=None,
    ) -> None:
        self.tokenizer = tokenizer
        self.num_bins = num_bins
        self.array_begin_idx = array_begin_idx
        token_list = [ACTION_TOKEN.format(i + self.array_begin_idx) for i in range(self.num_bins)]
        self.token_array = np.array(token_list)

        num_new_tokens = self.tokenizer.add_tokens(token_list, special_tokens=True)
        print(f"Add {num_new_tokens} GRIPPER TOKENS to tokenizer, tokenizer vocab size {self.tokenizer.vocab_size} / {len(tokenizer)}")

        self.token_start_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[0])
        self.token_end_idx = self.tokenizer.convert_tokens_to_ids(self.token_array[-1])

    def __call__(self, action: np.ndarray) -> List[str]:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n,), continuous gripper actions.
        return: np.ndarray, (n,), tokens.
        """
        ids = np.where(action >= 0.5, 1, 0)
        return self.token_array[ids]

    def decode_token_ids_to_actions(self, action_token_id: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_id: np.ndarray, (n,), token ids.
        return: np.ndarray, (n, 1), continuous actions
        """
        action_token_id = np.clip(action_token_id, self.token_start_idx, self.token_end_idx)
        ids = action_token_id - self.token_start_idx
        actions = np.where(ids == 0, 0., 1.)
        return actions[:, None]

    @property
    def vocab_size(self) -> int:
        return self.num_bins

class SpatialActionTokenizer:
    range_bins = {
        "translation": {
            "theta_bins": (0.0, np.pi),
            "phi_bins": (-np.pi, np.pi),
            "r_bins": (0.0, np.sqrt(3)),
        },
        "rotation": {
            "roll_bins": (-1.0, 1.0),
            "pitch_bins": (-1.0, 1.0),
            "yaw_bins": (-1.0, 1.0),
        },
    }

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_bins: Dict,
        gs_params: Dict = None,
        bin_policy: Dict = None,
        use_spherical: bool = True,
        min_sigma: float = 0.0,
        min_action: float = -1.0,
        max_action: float = 1.0,
    ):
        """Use bin_policy if given; otherwise calculate it from gs_params, or fall back to uniform bin grids.
        gs_params: Optional[Dict],
        bin_policy: Optional[Dict],
        """
        self.tokenizer = tokenizer
        self.min_action, self.max_action = min_action, max_action
        self.num_bins = num_bins
        self.min_sigma = min_sigma

        # set bin policy
        self.bin_policy = bin_policy if bin_policy else self.get_bin_policy(gs_params, self.min_sigma)
        self.translation_tokenizer = TranslationTokenizer(
            self.tokenizer,
            self.num_bins["translation"],
            self.bin_policy["translation"],
            use_spherical=use_spherical,
        )

        self.rotation_tokenizer = RotationTokenizer(
            self.tokenizer,
            self.num_bins["rotation"],
            self.bin_policy["rotation"],
            array_begin_idx=self.translation_tokenizer.vocab_size,
        )

        self.gripper_tokenizer = GripperTokenzier(
            self.tokenizer,
            self.num_bins["gripper"],
            array_begin_idx=self.translation_tokenizer.vocab_size + self.rotation_tokenizer.vocab_size,
        )
        self._vocab_size = self.translation_tokenizer.vocab_size + self.rotation_tokenizer.vocab_size + self.gripper_tokenizer.vocab_size

    def __call__(self, action: np.ndarray) -> List[str]:
        """Discretize continuous actions to tokens.
        action: np.ndarray, (n, 7), continuous actions in Cartesian coordinates.
        return: np.ndarray, (n, 3), tokens.
        """
        if len(action.shape) == 1:
            assert action.shape[0] == 7, f"action dim mismatch, got action shape: {action.shape}"
            action = action.reshape(1, 7)
        assert action.shape[1] == 7, f"action dim mismatch, got action shape: {action.shape}"

        action = np.clip(action, a_min=self.min_action, a_max=self.max_action)
        trans_tokens = self.translation_tokenizer(action[:, :3])  # (n,)
        rot_tokens = self.rotation_tokenizer(action[:, 3:6])  # (n,)
        grip_tokens = self.gripper_tokenizer(action[:, 6])  # (n,)
        return np.stack((trans_tokens, rot_tokens, grip_tokens), axis=1)  # (n, 3)

    def decode_token_ids_to_actions(self, action_token_ids: np.ndarray) -> np.ndarray:
        """Decode token ids to continuous actions.
        action_token_ids: np.ndarray, (n, 3), token ids.
        """
        if len(action_token_ids.shape) == 1:
            assert action_token_ids.shape[0] == 3, f"action token id number mismatch, need 3, got {action_token_ids.shape[0]}"
            action_token_ids = action_token_ids.reshape(1, 3)
        assert action_token_ids.shape[1] == 3, f"token id number mismatch, need 3, got {action_token_ids.shape[1]}"

        trans_action = self.translation_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 0])  # (n, 3)
        rot_action = self.rotation_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 1])  # (n, 3)
        grip_action = self.gripper_tokenizer.decode_token_ids_to_actions(action_token_ids[:, 2])  # (n, 1)
        return np.concatenate((trans_action, rot_action, grip_action), axis=1)  # (n, 7)

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    @property
    def action_token_begin_idx(self) -> int:
        return self.translation_tokenizer.token_start_idx

    def get_bin_policy(self, gs_params=None, min_sigma=0.0):
        bin_policy = {
            "translation": {"theta_bins": None, "phi_bins": None, "r_bins": None},
            "rotation": {"roll_bins": None, "pitch_bins": None, "yaw_bins": None},
        }
        if gs_params is None:
            for bin_type in self.range_bins.keys():
                for bin_key in self.range_bins[bin_type].keys():
                    bin_policy[bin_type][bin_key] = np.linspace(*self.range_bins[bin_type][bin_key], self.num_bins[bin_type][bin_key] + 1)
            print(f"use uniform bin grids ... \n{bin_policy}")
        else:
            for bin_type in self.range_bins.keys():
                for bin_key in self.range_bins[bin_type].keys():
                    mu = gs_params[bin_key.split("_")[0].lower()]["mu"]
                    sigma = max(gs_params[bin_key.split("_")[0].lower()]["sigma"], min_sigma)
                    bin_bound_prob = np.linspace(
                        norm.cdf(self.range_bins[bin_type][bin_key][0], loc=mu, scale=sigma),
                        norm.cdf(self.range_bins[bin_type][bin_key][1], loc=mu, scale=sigma),
                        self.num_bins[bin_type][bin_key] + 1,
                    )
                    bin_boundary = norm.ppf(bin_bound_prob, loc=mu, scale=sigma)
                    bin_policy[bin_type][bin_key] = np.clip(
                        bin_boundary,
                        self.range_bins[bin_type][bin_key][0],
                        self.range_bins[bin_type][bin_key][1],
                    ).tolist()  # for serialization
            print(f"calculate bin grids from gaussians \n{bin_policy}")
        return bin_policy

    def get_norm_meshgrid(self, bin_policy):
        grids = []
        policy = {k1: {k2: np.array(v2) for k2, v2 in v1.items()} for k1, v1 in bin_policy.items()}
        # NOTE: use the unified key order of range_bins (theta/phi/r, roll/pitch/yaw)
        for bin_type in self.range_bins.keys():
            bounds = []
            for bin_key in self.range_bins[bin_type].keys():
                minb, maxb = self.range_bins[bin_type][bin_key][0], self.range_bins[bin_type][bin_key][1]
                bin_boundary = policy[bin_type][bin_key]
                bin_center = (bin_boundary[:-1] + bin_boundary[1:]) / 2
                bin_center = np.concatenate([np.array([minb]), bin_center, np.array([maxb])])  # padding
                bin_center = (bin_center - minb) / (maxb - minb)  # normalize (m, n, k)
                bounds.append(bin_center)
            # generate grids
            grid_x, grid_y, grid_z = np.meshgrid(*bounds)
            grids += [np.stack([grid_x, grid_y, grid_z], -1).reshape(-1, 3)]
        return grids[0], grids[1]  # (N, 3)

    def spatial_embedding_adaption(self, gs_params, embeddings: torch.nn.Embedding, min_sigma=0.0, adpt_feature=False):
        """
        gs_params: Dict
        embeddings: tensor (S, E)
        """
        from scipy.interpolate import griddata
        new_policy = self.get_bin_policy(gs_params, min_sigma=min_sigma)
        trans_grids0, rot_grids0 = self.get_norm_meshgrid(self.bin_policy)
        trans_grids1, rot_grids1 = self.get_norm_meshgrid(new_policy)

        print("overwrite bin policy and tokenizer bins ...")
        self.bin_policy = new_policy
        self.min_sigma = min_sigma
        self.translation_tokenizer.set_bins(new_policy["translation"])
        self.rotation_tokenizer.set_bins(new_policy["rotation"])

        if adpt_feature:
            emb_data = embeddings.weight.data  # (S, E)
            _, E = emb_data.shape

            # translation
            m, n, k = (self.num_bins["translation"][k] for k in ["theta_bins", "phi_bins", "r_bins"])
            N = m * n * k
            trans_emb_data = emb_data[:N].reshape(m, n, k, -1).permute(3, 0, 1, 2)  # (E, m, n, k)
            pad_emb = torch.nn.functional.pad(trans_emb_data, (1, 1, 1, 1, 1, 1), "replicate").permute(1, 2, 3, 0).reshape(-1, E)
            adpt_trans_emb = griddata(trans_grids0, pad_emb.float(), trans_grids1, method='linear')
            adpt_trans_emb = adpt_trans_emb.reshape(m + 2, n + 2, k + 2, E)[1:-1, 1:-1, 1:-1]

            # rotation
            m1, n1, k1 = (self.num_bins["rotation"][k] for k in ["roll_bins", "pitch_bins", "yaw_bins"])
            M = m1 * n1 * k1
            rot_emb_data = emb_data[N : N + M].reshape(m1, n1, k1, -1).permute(3, 0, 1, 2)  # (E, m, n, k)
            pad_emb = torch.nn.functional.pad(rot_emb_data, (1, 1, 1, 1, 1, 1), "replicate").permute(1, 2, 3, 0).reshape(-1, E)
            adpt_rot_emb = griddata(rot_grids0, pad_emb.float(), rot_grids1, method='linear')
            adpt_rot_emb = adpt_rot_emb.reshape(m1 + 2, n1 + 2, k1 + 2, E)[1:-1, 1:-1, 1:-1]

            # set data
            device, dtype = embeddings.weight.data.device, embeddings.weight.data.dtype
            embeddings.weight.data[:N] = torch.Tensor(adpt_trans_emb.reshape(-1, E), device=device).to(dtype)
            embeddings.weight.data[N:N + M] = torch.Tensor(adpt_rot_emb.reshape(-1, E), device=device).to(dtype)
            print("DONE! adapting the spatial embeddings to the new gaussian distribution finished.")
            print(embeddings.weight.data)
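
Editor's note: to make the binning scheme above easier to follow, here is a minimal NumPy-only sketch of the translation encode/decode path on uniform spherical grids. The bin counts and the test action are illustrative assumptions, not values taken from this checkpoint, and no Hugging Face tokenizer is involved.

import numpy as np

# Assumed illustrative bin counts; the real counts come from the model's num_bins dict.
NT, NPHI, NR = 8, 8, 8
theta_bins = np.linspace(0.0, np.pi, NT + 1)
phi_bins = np.linspace(-np.pi, np.pi, NPHI + 1)
r_bins = np.linspace(0.0, np.sqrt(3), NR + 1)

def encode_xyz(xyz):
    x, y, z = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    theta = np.arctan2(np.sqrt(x**2 + y**2), z)   # polar angle
    phi = np.arctan2(y, x)                        # azimuthal angle
    r = np.sqrt(x**2 + y**2 + z**2)
    dt = np.digitize(theta, theta_bins[1:-1])     # 0 .. NT-1
    dp = np.digitize(phi, phi_bins[1:-1])
    dr = np.digitize(r, r_bins[1:-1])
    return dt * (NPHI * NR) + dp * NR + dr        # flat bin id, as in TranslationTokenizer

def decode_ids(ids):
    dt, dp, dr = ids // (NPHI * NR), (ids % (NPHI * NR)) // NR, ids % NR
    theta = 0.5 * (theta_bins[dt] + theta_bins[dt + 1])   # bin centers
    phi = 0.5 * (phi_bins[dp] + phi_bins[dp + 1])
    r = 0.5 * (r_bins[dr] + r_bins[dr + 1])
    x = r * np.sin(theta) * np.cos(phi)
    y = r * np.sin(theta) * np.sin(phi)
    z = r * np.cos(theta)
    return np.clip(np.stack((x, y, z), axis=1), -1, 1)

xyz = np.array([[0.2, -0.1, 0.05]])               # assumed demo translation
print(decode_ids(encode_xyz(xyz)))                # coarse reconstruction of the input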
config.json ADDED
@@ -0,0 +1,318 @@
{
  "_vocab_size": 265347,
  "action_token_begin_idx": 257153,
  "architectures": [
    "SpatialVLAForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_spatialvla.SpatialVLAConfig",
    "AutoModel": "modeling_spatialvla.SpatialVLAForConditionalGeneration"
  },
  "bos_token_id": 2,
  "ego3d_patch_reso": 2,
  "eos_token_id": 1,
  "hidden_size": 2048,
  "image_token_index": 257152,
  "model_type": "spatialvla",
  "n_freqs": 8,
  "num_hidden_layers": 26,
  "pad_token_id": 0,
  "projection_dim": 2304,
  "spatial_token_num": 8194,
  "text_config": {
    "architectures": [
      "Gemma2ForCausalLM"
    ],
    "eos_token_id": [
      1,
      107
    ],
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 2304,
    "intermediate_size": 9216,
    "model_type": "gemma2",
    "num_hidden_layers": 26,
    "num_image_tokens": 256,
    "num_key_value_heads": 4,
    "tie_word_embeddings": false,
    "torch_dtype": "bfloat16",
    "vocab_size": 265347
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.47.0",
  "use_spatial_token": true,
  "use_vision_zoe": true,
  "vision_config": {
    "hidden_size": 1152,
    "intermediate_size": 4304,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "num_image_tokens": 256,
    "num_positions": 256,
    "patch_size": 14,
    "projection_dim": 2304,
    "torch_dtype": "bfloat16",
    "vision_use_head": false
  },
  "vision_zoe_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "Intel/zoedepth-nyu-kitti",
    "add_cross_attention": false,
    "add_projection": false,
    "architectures": [
      "ZoeDepthForDepthEstimation"
    ],
    "attractor_alpha": 1000,
    "attractor_gamma": 2,
    "attractor_kind": "mean",
    "backbone": null,
    "backbone_config": {
      "_attn_implementation_autoset": false,
      "_name_or_path": "",
      "add_cross_attention": false,
      "add_fpn": false,
      "architectures": null,
      "attention_probs_dropout_prob": 0.0,
      "auxiliary_channels": 256,
      "auxiliary_concat_input": false,
      "auxiliary_loss_weight": 0.4,
      "auxiliary_num_convs": 1,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "drop_path_rate": 0.1,
      "early_stopping": false,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "hidden_act": "gelu",
      "hidden_dropout_prob": 0.0,
      "hidden_size": 1024,
      "id2label": {
        "0": "LABEL_0",
        "1": "LABEL_1"
      },
      "image_size": 384,
      "initializer_range": 0.02,
      "intermediate_size": 4096,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {
        "LABEL_0": 0,
        "LABEL_1": 1
      },
      "layer_norm_eps": 1e-12,
      "layer_scale_init_value": 0.1,
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "beit",
      "no_repeat_ngram_size": 0,
      "num_attention_heads": 16,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_channels": 3,
      "num_hidden_layers": 24,
      "num_return_sequences": 1,
      "out_features": [
        "stage6",
        "stage12",
        "stage18",
        "stage24"
      ],
      "out_indices": [
        6,
        12,
        18,
        24
      ],
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "patch_size": 16,
      "pool_scales": [
        1,
        2,
        3,
        6
      ],
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "reshape_hidden_states": false,
      "return_dict": true,
      "return_dict_in_generate": false,
      "semantic_loss_ignore_index": 255,
      "sep_token_id": null,
      "stage_names": [
        "stem",
        "stage1",
        "stage2",
        "stage3",
        "stage4",
        "stage5",
        "stage6",
        "stage7",
        "stage8",
        "stage9",
        "stage10",
        "stage11",
        "stage12",
        "stage13",
        "stage14",
        "stage15",
        "stage16",
        "stage17",
        "stage18",
        "stage19",
        "stage20",
        "stage21",
        "stage22",
        "stage23",
        "stage24"
      ],
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torch_dtype": null,
      "torchscript": false,
      "typical_p": 1.0,
      "use_absolute_position_embeddings": false,
      "use_auxiliary_head": true,
      "use_bfloat16": false,
      "use_mask_token": false,
      "use_mean_pooling": true,
      "use_relative_position_bias": true,
      "use_shared_relative_position_bias": false,
      "vocab_size": 8192
    },
    "backbone_hidden_size": 1024,
    "bad_words_ids": null,
    "batch_norm_eps": 1e-05,
    "begin_suppress_tokens": null,
    "bin_centers_type": "softplus",
    "bin_configurations": [
      {
        "max_depth": 10.0,
        "min_depth": 0.001,
        "n_bins": 64,
        "name": "nyu"
      },
      {
        "max_depth": 80.0,
        "min_depth": 0.001,
        "n_bins": 64,
        "name": "kitti"
      }
    ],
    "bin_embedding_dim": 128,
    "bos_token_id": null,
    "bottleneck_features": 256,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "fusion_hidden_size": 256,
    "head_in_index": -1,
    "hidden_act": "gelu",
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "max_temp": 50.0,
    "min_length": 0,
    "min_temp": 0.0212,
    "model_type": "zoedepth",
    "neck_hidden_sizes": [
      256,
      512,
      1024,
      1024
    ],
    "no_repeat_ngram_size": 0,
    "num_attractors": [
      16,
      8,
      4,
      1
    ],
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_patch_transformer_layers": 4,
    "num_relative_features": 32,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_transformer_hidden_size": 128,
    "patch_transformer_intermediate_size": 1024,
    "patch_transformer_num_attention_heads": 4,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "readout_type": "project",
    "reassemble_factors": [
      4,
      2,
      1,
      0.5
    ],
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": "bfloat16",
    "torchscript": false,
    "typical_p": 1.0,
    "use_batch_norm_in_fusion_residual": false,
    "use_bfloat16": false,
    "use_bias_in_fusion_residual": null,
    "use_pretrained_backbone": false
  }
}
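
Editor's note: because `auto_map` routes `AutoConfig`/`AutoModel` to the custom `configuration_spatialvla.py` and `modeling_spatialvla.py` modules, this checkpoint is meant to be loaded with `trust_remote_code=True`. A minimal loading sketch; the local path is a placeholder you must substitute with the actual repo id or download location:

import torch
from transformers import AutoConfig, AutoModel

model_path = "./spatialvla-checkpoint"  # placeholder; substitute the real repo id or local folder

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.model_type, config.spatial_token_num, config.use_vision_zoe)

# torch_dtype matches the "bfloat16" entry in config.json
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)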
configuration_spatialvla.py ADDED
@@ -0,0 +1,121 @@
# coding=utf-8
# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SpatialVLA model configuration (adapted from the PaliGemma configuration)."""

import warnings

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from transformers import CONFIG_MAPPING, AutoConfig

logger = logging.get_logger(__name__)

class SpatialVLAConfig(PretrainedConfig):
    model_type = "spatialvla"
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "vision_zoe_config": AutoConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_index=256000,
        vocab_size=257152,
        projection_dim=2048,
        hidden_size=2048,
        vision_zoe_config=None,
        action_token_begin_idx=None,
        spatial_token_num=259,
        use_spatial_token=False,
        ego3d_patch_reso=4,
        n_freqs=8,
        use_vision_zoe=True,
        **kwargs,
    ):
        self._ignore_index = ignore_index
        self.image_token_index = image_token_index
        self._vocab_size = vocab_size
        self.projection_dim = projection_dim
        self.hidden_size = hidden_size
        self.vision_config = vision_config
        self.is_encoder_decoder = False

        if isinstance(self.vision_config, dict):
            vision_config["model_type"] = (
                vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
            )
            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
        elif vision_config is None:
            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
                intermediate_size=4096,
                hidden_size=1152,
                patch_size=14,
                image_size=224,
                num_hidden_layers=27,
                num_attention_heads=16,
                vocab_size=257152,
                vision_use_head=False,
            )

        self.text_config = text_config
        if isinstance(self.text_config, dict):
            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma2"
            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
        elif text_config is None:
            self.text_config = CONFIG_MAPPING["gemma2"](
                hidden_size=2048,
                num_hidden_layers=18,
                intermediate_size=16384,
                num_attention_heads=8,
                num_key_value_heads=1,
                is_encoder_decoder=False,
                vocab_size=vocab_size,
            )
        self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
        self.vision_config.projection_dim = projection_dim

        # vision zoe config
        self.vision_zoe_config = vision_zoe_config
        if isinstance(self.vision_zoe_config, dict):
            vision_zoe_config["model_type"] = vision_zoe_config["model_type"] if "model_type" in vision_zoe_config else "zoedepth"
            self.vision_zoe_config = CONFIG_MAPPING[vision_zoe_config["model_type"]](**vision_zoe_config)
        else:
            pass

        # additional attributes
        self.action_token_begin_idx = action_token_begin_idx
        self.spatial_token_num = spatial_token_num
        self.use_spatial_token = use_spatial_token
        self.ego3d_patch_reso = ego3d_patch_reso
        self.n_freqs = n_freqs
        self.use_vision_zoe = use_vision_zoe

        super().__init__(**kwargs)

    @property
    def ignore_index(self):
        warnings.warn(
            "The `ignore_index` attribute is deprecated and will be removed in v4.47.",
            FutureWarning,
        )
        return self._ignore_index

    @ignore_index.setter
    def ignore_index(self, value):
        self._ignore_index = value

    def to_dict(self):
        output = super().to_dict()
        output.pop("_ignore_index", None)
        return output
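
Editor's note: a quick smoke test of `SpatialVLAConfig`, assuming the file above is saved locally as `configuration_spatialvla.py` and a transformers version with Gemma2/SigLIP support is installed:

from configuration_spatialvla import SpatialVLAConfig

cfg = SpatialVLAConfig()                     # builds default SigLIP vision and Gemma2 text sub-configs
print(type(cfg.vision_config).__name__)      # SiglipVisionConfig
print(type(cfg.text_config).__name__)        # Gemma2Config
print(cfg.text_config.num_image_tokens)      # (image_size // patch_size) ** 2 = 256
assert "_ignore_index" not in cfg.to_dict()  # to_dict() strips the deprecated field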
example.png ADDED
generation_config.json ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.47.0"
}
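
Editor's note: these generation defaults can also be materialized programmatically; a small sketch that simply mirrors the file contents via transformers' GenerationConfig:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_dict({
    "bos_token_id": 2,
    "cache_implementation": "hybrid",  # Gemma2-style hybrid (sliding/global) KV cache
    "eos_token_id": 1,
    "pad_token_id": 0,
})
print(gen_cfg.cache_implementation)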
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:014d7ca643d9ecb48a3689e7d2b3875b2ea1ae71fc422f8179ae50c786a608eb
size 4969426016
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:22501b6216c652d94bb4917d85bc13d891b6aeb66ff785300436c1e8f8e5ed07
size 3086476734
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_gemma2.py ADDED
@@ -0,0 +1,1283 @@
1
+ # custom gemma2 to support flash_attention_2,
2
+ # source from https://github.com/huggingface/transformers/blob/v4.47.0/src/transformers/models/gemma2/modeling_gemma2.py
3
+ # coding=utf-8
4
+ # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
5
+ #
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ from typing import List, Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from transformers.activations import ACT2FN
24
+ from transformers.cache_utils import Cache, HybridCache
25
+ from transformers.generation import GenerationMixin
26
+ from transformers.modeling_outputs import (
27
+ BaseModelOutputWithPast,
28
+ CausalLMOutputWithPast,
29
+ SequenceClassifierOutputWithPast,
30
+ TokenClassifierOutput,
31
+ )
32
+ from transformers.modeling_utils import PreTrainedModel
33
+ from transformers.utils import (
34
+ add_code_sample_docstrings,
35
+ add_start_docstrings,
36
+ add_start_docstrings_to_model_forward,
37
+ is_flash_attn_2_available,
38
+ is_flash_attn_greater_or_equal,
39
+ is_torch_greater_or_equal,
40
+ logging,
41
+ replace_return_docstrings,
42
+ is_flash_attn_greater_or_equal_2_10,
43
+ )
44
+ from transformers import Gemma2Config
45
+
46
+
47
+ if is_flash_attn_2_available():
48
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
49
+
50
+ if is_torch_greater_or_equal("2.5"):
51
+ from torch.nn.attention.flex_attention import flex_attention
52
+
53
+ logger = logging.get_logger(__name__)
54
+
55
+
56
+ _CHECKPOINT_FOR_DOC = "google/gemma2-7b"
57
+ _CONFIG_FOR_DOC = "Gemma2Config"
58
+
59
+
60
+ class Gemma2RMSNorm(nn.Module):
61
+ def __init__(self, dim: int, eps: float = 1e-6):
62
+ super().__init__()
63
+ self.eps = eps
64
+ self.weight = nn.Parameter(torch.zeros(dim))
65
+
66
+ def _norm(self, x):
67
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
68
+
69
+ def forward(self, x):
70
+ output = self._norm(x.float())
71
+ # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
72
+ # See https://github.com/huggingface/transformers/pull/29402
73
+ output = output * (1.0 + self.weight.float())
74
+ return output.type_as(x)
75
+
76
+ def extra_repr(self):
77
+ return f"{tuple(self.weight.shape)}, eps={self.eps}"
78
+
79
+
80
+ class Gemma2MLP(nn.Module):
81
+ def __init__(self, config):
82
+ super().__init__()
83
+ self.config = config
84
+ self.hidden_size = config.hidden_size
85
+ self.intermediate_size = config.intermediate_size
86
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
87
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
88
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
89
+ self.act_fn = ACT2FN[config.hidden_activation]
90
+
91
+ def forward(self, x):
92
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
93
+
94
+
95
+ class Gemma2RotaryEmbedding(nn.Module):
96
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
97
+ super().__init__()
98
+
99
+ self.dim = dim
100
+ self.max_position_embeddings = max_position_embeddings
101
+ self.base = base
102
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
103
+ self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)
104
+
105
+ @torch.no_grad()
106
+ def forward(self, x, position_ids, seq_len=None):
107
+ # x: [bs, num_attention_heads, seq_len, head_size]
108
+ self.inv_freq.to(x.device)
109
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
110
+ position_ids_expanded = position_ids[:, None, :].float()
111
+ # Force float32 since bfloat16 loses precision on long contexts
112
+ # See https://github.com/huggingface/transformers/pull/29285
113
+ device_type = x.device.type
114
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
115
+ with torch.autocast(device_type=device_type, enabled=False):
116
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
117
+ emb = torch.cat((freqs, freqs), dim=-1)
118
+ cos = emb.cos()
119
+ sin = emb.sin()
120
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
121
+
122
+
123
+ def rotate_half(x):
124
+ """Rotates half the hidden dims of the input."""
125
+ x1 = x[..., : x.shape[-1] // 2]
126
+ x2 = x[..., x.shape[-1] // 2 :]
127
+ return torch.cat((-x2, x1), dim=-1)
128
+
129
+
130
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
131
+ """Applies Rotary Position Embedding to the query and key tensors.
132
+
133
+ Args:
134
+ q (`torch.Tensor`): The query tensor.
135
+ k (`torch.Tensor`): The key tensor.
136
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
137
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
138
+ position_ids (`torch.Tensor`, *optional*):
139
+ Deprecated and unused.
140
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
141
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
142
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
143
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
144
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
145
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
146
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
147
+ Returns:
148
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
149
+ """
150
+ cos = cos.unsqueeze(unsqueeze_dim)
151
+ sin = sin.unsqueeze(unsqueeze_dim)
152
+ q_embed = (q * cos) + (rotate_half(q) * sin)
153
+ k_embed = (k * cos) + (rotate_half(k) * sin)
154
+ return q_embed, k_embed
155
+
156
+
157
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
158
+ """
159
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
160
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
161
+ """
162
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
163
+ if n_rep == 1:
164
+ return hidden_states
165
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
166
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
167
+
168
+
169
+ def eager_attention_forward(
170
+ config: Gemma2Config,
171
+ query: torch.Tensor,
172
+ key: torch.Tensor,
173
+ value: torch.Tensor,
174
+ mask: Optional[torch.Tensor],
175
+ **_kwargs,
176
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
177
+ key_states = repeat_kv(key, config.num_key_value_groups)
178
+ value_states = repeat_kv(value, config.num_key_value_groups)
179
+
180
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
181
+
182
+ if config.attn_logit_softcapping is not None:
183
+ attn_weights = attn_weights / config.attn_logit_softcapping
184
+ attn_weights = torch.tanh(attn_weights)
185
+ attn_weights = attn_weights * config.attn_logit_softcapping
186
+ if mask is not None: # no matter the length, we just slice it
187
+ causal_mask = mask[:, :, :, : key_states.shape[-2]]
188
+ attn_weights = attn_weights + causal_mask
189
+
190
+ # upcast attention to fp32
191
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
192
+ attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
193
+ attn_output = torch.matmul(attn_weights, value_states)
194
+ attn_output = attn_output.transpose(1, 2).contiguous()
195
+ return attn_output, attn_weights
196
+
197
+
198
+ def flash_attention_forward(
199
+ config: Gemma2Config,
200
+ query: torch.Tensor,
201
+ key: torch.Tensor,
202
+ value: torch.Tensor,
203
+ mask: Optional[torch.Tensor],
204
+ target_dtype: torch.dtype = torch.float16,
205
+ **_kwargs,
206
+ ) -> Tuple[torch.Tensor, None]:
207
+ # NOTE: None mask cause un defined https://github.com/huggingface/transformers/blob/c8c8dffbe45ebef0a8dba4a51024e5e5e498596b/src/transformers/models/gemma2/modeling_gemma2.py#L211
208
+ seq_len = query.shape[2]
209
+ if mask is not None:
210
+ query = query[:, :, :seq_len]
211
+ value = value[:, :, :seq_len]
212
+
213
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout
214
+ # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
215
+ query_states = query.transpose(1, 2)
216
+ key_states = key.transpose(1, 2)
217
+ value_states = value.transpose(1, 2)
218
+
219
+ dropout_rate = config.attention_dropout if config.training else 0.0
220
+
221
+ input_dtype = query_states.dtype
222
+ if input_dtype == torch.float32:
223
+ query_states = query_states.to(target_dtype)
224
+ key_states = key_states.to(target_dtype)
225
+ value_states = value_states.to(target_dtype)
226
+
227
+ attn_output = _flash_attention_forward(
228
+ query_states,
229
+ key_states,
230
+ value_states,
231
+ mask,
232
+ seq_len,
233
+ dropout=dropout_rate,
234
+ softmax_scale=config.scaling,
235
+ is_causal=config.is_causal,
236
+ sliding_window=config.sliding_window,
237
+ use_top_left_mask=config._flash_attn_uses_top_left_mask,
238
+ softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
239
+ )
240
+
241
+ return attn_output, None
242
+
243
+
244
+ def flex_attention_forward(
245
+ config: Gemma2Config,
246
+ query: torch.Tensor,
247
+ key: torch.Tensor,
248
+ value: torch.Tensor,
249
+ mask: Optional[torch.Tensor],
250
+ output_attentions: bool = False,
251
+ **_kwargs,
252
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
253
+ def tanh_softcap(score, b, h, q_idx, kv_idx):
254
+ soft_cap = config.attn_logit_softcapping
255
+ score = soft_cap * torch.tanh(score / soft_cap)
256
+ if mask is not None:
257
+ return score + mask[b][0][q_idx][kv_idx]
258
+ return score
259
+
260
+ attn_output = flex_attention(
261
+ query,
262
+ key,
263
+ value,
264
+ score_mod=tanh_softcap,
265
+ enable_gqa=True,
266
+ scale=config.scaling,
267
+ return_lse=output_attentions,
268
+ )
269
+ if not output_attentions:
270
+ attn_weights = None
271
+ else:
272
+ attn_output, attn_weights = attn_output
273
+
274
+ attn_output = attn_output.transpose(1, 2).contiguous()
275
+ return attn_output, attn_weights
276
+
277
+
278
+ def sdpa_attention_forward(
279
+ config: Gemma2Config,
280
+ query: torch.Tensor,
281
+ key: torch.Tensor,
282
+ value: torch.Tensor,
283
+ mask: Optional[torch.Tensor],
284
+ **_kwargs,
285
+ ) -> Tuple[torch.Tensor, None]:
286
+ key = repeat_kv(key, config.num_key_value_groups)
287
+ value = repeat_kv(value, config.num_key_value_groups)
288
+
289
+ causal_mask = mask
290
+ if mask is not None:
291
+ causal_mask = causal_mask[:, :, :, : key.shape[-2]]
292
+
293
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
294
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
295
+ if query.device.type == "cuda" and causal_mask is not None:
296
+ query = query.contiguous()
297
+ key = key.contiguous()
298
+ value = value.contiguous()
299
+
300
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
301
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
302
+ is_causal = True if causal_mask is None and query.shape[1] > 1 else False
303
+
304
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
305
+ query,
306
+ key,
307
+ value,
308
+ attn_mask=causal_mask,
309
+ dropout_p=config.attention_dropout if config.training else 0.0,
310
+ is_causal=is_causal,
311
+ scale=config.scaling,
312
+ )
313
+ attn_output = attn_output.transpose(1, 2).contiguous()
314
+ return attn_output, None
315
+
316
+
317
+ GEMMA2_ATTENTION_FUNCTION = {
318
+ "flash_attention_2": flash_attention_forward,
319
+ "flex_attention": flex_attention_forward,
320
+ "eager": eager_attention_forward,
321
+ "sdpa": sdpa_attention_forward,
322
+ }
323
+
324
+
325
+ class Gemma2Attention(nn.Module):
326
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
327
+
328
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
329
+ super().__init__()
330
+ self.config = config
331
+ self.layer_idx = layer_idx
332
+
333
+ self.attention_dropout = config.attention_dropout
334
+ self.hidden_size = config.hidden_size
335
+ self.num_heads = config.num_attention_heads
336
+ self.head_dim = config.head_dim
337
+ self.num_key_value_heads = config.num_key_value_heads
338
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
339
+ self.max_position_embeddings = config.max_position_embeddings
340
+ self.rope_theta = config.rope_theta
341
+ self.is_causal = True
342
+ self.scaling = config.query_pre_attn_scalar**-0.5
343
+ self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
344
+ self.attn_logit_softcapping = config.attn_logit_softcapping
345
+ if self.hidden_size % self.num_heads != 0:
346
+ raise ValueError(
347
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
348
+ f" and `num_heads`: {self.num_heads})."
349
+ )
350
+
351
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
352
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
353
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
354
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
355
+ self.rotary_emb = Gemma2RotaryEmbedding(
356
+ self.head_dim,
357
+ max_position_embeddings=self.max_position_embeddings,
358
+ base=self.rope_theta,
359
+ )
360
+
361
+ # NOTE: gemma2 do not include _flash_attn_uses_top_left_mask for flash attention
362
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
363
+
364
+ def forward(
365
+ self,
366
+ hidden_states: torch.Tensor,
367
+ attention_mask: Optional[torch.Tensor] = None,
368
+ position_ids: Optional[torch.LongTensor] = None,
369
+ past_key_value: Optional[Cache] = None,
370
+ output_attentions: bool = False,
371
+ use_cache: bool = False,
372
+ cache_position: Optional[torch.LongTensor] = None,
373
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
374
+ bsz, q_len, _ = hidden_states.size()
375
+
376
+ query_states = self.q_proj(hidden_states)
377
+ key_states = self.k_proj(hidden_states)
378
+ value_states = self.v_proj(hidden_states)
379
+
380
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
381
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
382
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
383
+
384
+ cos, sin = self.rotary_emb(value_states, position_ids)
385
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
386
+
387
+ if past_key_value is not None:
388
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
389
+ cache_kwargs = {
390
+ "sin": sin,
391
+ "cos": cos,
392
+ "sliding_window": self.sliding_window,
393
+ "cache_position": cache_position,
394
+ }
395
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
396
+
397
+ if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
398
+ logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
399
+ attention_type = "flex_attention"
400
+ else:
401
+ attention_type = self.config._attn_implementation
402
+
403
+ attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
404
+ self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
405
+ )
406
+
407
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
408
+ attn_output = self.o_proj(attn_output)
409
+
410
+ if not output_attentions:
411
+ attn_weights = None
412
+
413
+ return attn_output, attn_weights, past_key_value
414
+
415
+
416
+ class Gemma2FlashAttention2(Gemma2Attention):
417
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
418
+ super().__init__(config, layer_idx)
419
+ self.config._attn_implementation = "flash_attention_2"
420
+ logger.warning_once(
421
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
422
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
423
+ )
424
+
425
+
426
+ class Gemma2SdpaAttention(Gemma2Attention):
427
+ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
428
+ super().__init__(config, layer_idx)
429
+ self.config._attn_implementation = "sdpa"
430
+ logger.warning_once(
431
+ "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
432
+ "attribute of the `GemmaAttention` class! It will be removed in v4.48"
433
+ )
434
+
435
+
436
+ class Gemma2DecoderLayer(nn.Module):
437
+ def __init__(self, config: Gemma2Config, layer_idx: int):
438
+ super().__init__()
439
+ self.hidden_size = config.hidden_size
440
+ self.config = config
441
+ self.is_sliding = not bool(layer_idx % 2)
442
+ self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
443
+ self.mlp = Gemma2MLP(config)
444
+ self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
445
+ self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
446
+
447
+ self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
448
+ self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
449
+ self.sliding_window = config.sliding_window
450
+
451
+ def forward(
452
+ self,
453
+ hidden_states: torch.Tensor,
454
+ attention_mask: Optional[torch.Tensor] = None,
455
+ position_ids: Optional[torch.LongTensor] = None,
456
+ past_key_value: Optional[Cache] = None,
457
+ output_attentions: Optional[bool] = False,
458
+ use_cache: Optional[bool] = False,
459
+ cache_position: Optional[torch.LongTensor] = None,
460
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
461
+ if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
462
+ # with flash_attention_2, the attention mask is a 2D tensor of shape (batch_size, seq_len)
463
+ if self.config._attn_implementation == "flash_attention_2":
464
+ if past_key_value is not None: # when decoding
465
+ attention_mask = attention_mask[:, -self.sliding_window :]
466
+ else:
467
+ min_dtype = torch.finfo(hidden_states.dtype).min
468
+ sliding_window_mask = torch.tril(
469
+ torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
470
+ )
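+ # entries whose key is at least `sliding_window` positions behind the query are pushed to the dtype minimum below, restricting attention to the most recent window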
471
+ attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
472
+ if attention_mask.shape[-1] <= 1: # when decoding
473
+ attention_mask = attention_mask[:, :, :, -self.sliding_window :]
474
+
475
+ residual = hidden_states
476
+
477
+ hidden_states = self.input_layernorm(hidden_states)
478
+
479
+ # Self Attention
480
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
481
+ hidden_states=hidden_states,
482
+ attention_mask=attention_mask,
483
+ position_ids=position_ids,
484
+ past_key_value=past_key_value,
485
+ output_attentions=output_attentions,
486
+ use_cache=use_cache,
487
+ cache_position=cache_position,
488
+ )
489
+ hidden_states = self.post_attention_layernorm(hidden_states)
490
+ hidden_states = residual + hidden_states
491
+
492
+ residual = hidden_states
493
+ hidden_states = self.pre_feedforward_layernorm(hidden_states)
494
+ hidden_states = self.mlp(hidden_states)
495
+ hidden_states = self.post_feedforward_layernorm(hidden_states)
496
+ hidden_states = residual + hidden_states
497
+
498
+ outputs = (hidden_states,)
499
+
500
+ if output_attentions:
501
+ outputs += (self_attn_weights,)
502
+
503
+ if use_cache:
504
+ outputs += (present_key_value,)
505
+
506
+ return outputs
507
+
508
+
509
+ GEMMA2_START_DOCSTRING = r"""
510
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
511
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
512
+ etc.)
513
+
514
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
515
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
516
+ and behavior.
517
+
518
+ Parameters:
519
+ config ([`Gemma2Config`]):
520
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
521
+ load the weights associated with the model, only the configuration. Check out the
522
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
523
+ """
524
+
525
+
526
+ @add_start_docstrings(
527
+ "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
528
+ GEMMA2_START_DOCSTRING,
529
+ )
530
+ class Gemma2PreTrainedModel(PreTrainedModel):
531
+ config_class = Gemma2Config
532
+ base_model_prefix = "model"
533
+ supports_gradient_checkpointing = True
534
+ _no_split_modules = ["Gemma2DecoderLayer"]
535
+ _skip_keys_device_placement = ["past_key_values"]
536
+ _supports_flash_attn_2 = True
537
+ _supports_sdpa = True
538
+ _supports_cache_class = True
539
+ _supports_quantized_cache = False
540
+ _supports_static_cache = True
541
+
542
+ def _init_weights(self, module):
543
+ std = self.config.initializer_range
544
+ if isinstance(module, nn.Linear):
545
+ module.weight.data.normal_(mean=0.0, std=std)
546
+ if module.bias is not None:
547
+ module.bias.data.zero_()
548
+ elif isinstance(module, nn.Embedding):
549
+ module.weight.data.normal_(mean=0.0, std=std)
550
+ if module.padding_idx is not None:
551
+ module.weight.data[module.padding_idx].zero_()
552
+
553
+ @classmethod
554
+ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
555
+ """
556
+ Overloads `PreTrainedModel._check_and_enable_sdpa` so as to DISABLE torch SDPA by default on Gemma2 models.
557
+ SDPA reduces the model performance on Gemma2 because of the logits softcapping.
558
+ """
559
+ config = super()._check_and_enable_sdpa(config, hard_check_only=hard_check_only)
560
+
561
+ # if using the default path -> swap sdpa by eager
562
+ if not hard_check_only and config._attn_implementation == "sdpa":
563
+ config._attn_implementation = "eager"
564
+
565
+ return config
566
+
567
+
568
+ GEMMA2_INPUTS_DOCSTRING = r"""
569
+ Args:
570
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
571
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
572
+ it.
573
+
574
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
575
+ [`PreTrainedTokenizer.__call__`] for details.
576
+
577
+ [What are input IDs?](../glossary#input-ids)
578
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
579
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
580
+
581
+ - 1 for tokens that are **not masked**,
582
+ - 0 for tokens that are **masked**.
583
+
584
+ [What are attention masks?](../glossary#attention-mask)
585
+
586
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
587
+ [`PreTrainedTokenizer.__call__`] for details.
588
+
589
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
590
+ `past_key_values`).
591
+
592
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
593
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
594
+ information on the default strategy.
595
+
596
+ - 1 indicates the head is **not masked**,
597
+ - 0 indicates the head is **masked**.
598
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
599
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
600
+ config.n_positions - 1]`.
601
+
602
+ [What are position IDs?](../glossary#position-ids)
603
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
604
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
605
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
606
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
607
+
608
+ Two formats are allowed:
609
+ - a [`~cache_utils.Cache`] instance, see our
610
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
611
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
612
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
613
+ cache format.
614
+
615
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
616
+ legacy cache format will be returned.
617
+
618
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
619
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
620
+ of shape `(batch_size, sequence_length)`.
621
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
622
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
623
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
624
+ model's internal embedding lookup matrix.
625
+ use_cache (`bool`, *optional*):
626
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
627
+ `past_key_values`).
628
+ output_attentions (`bool`, *optional*):
629
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
630
+ tensors for more detail.
631
+ output_hidden_states (`bool`, *optional*):
632
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
633
+ more detail.
634
+ return_dict (`bool`, *optional*):
635
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
636
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
637
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
638
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
639
+ the complete sequence length.
640
+ """
641
+
642
+
643
+ @add_start_docstrings(
644
+ "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
645
+ GEMMA2_START_DOCSTRING,
646
+ )
647
+ class Gemma2Model(Gemma2PreTrainedModel):
648
+ """
649
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Gemma2DecoderLayer`]
650
+
651
+ Args:
652
+ config: Gemma2Config
653
+ """
654
+
655
+ def __init__(self, config: Gemma2Config):
656
+ super().__init__(config)
657
+ self.padding_idx = config.pad_token_id
658
+ self.vocab_size = config.vocab_size
659
+
660
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
661
+ self.layers = nn.ModuleList(
662
+ [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
663
+ )
664
+ self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
665
+
666
+ self.gradient_checkpointing = False
667
+ if getattr(config, "pretraining_tp", 1) != 1:
668
+ logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
669
+
670
+ # Initialize weights and apply final processing
671
+ self.post_init()
672
+
673
+ def get_input_embeddings(self):
674
+ return self.embed_tokens
675
+
676
+ def set_input_embeddings(self, value):
677
+ self.embed_tokens = value
678
+
679
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
680
+ def forward(
681
+ self,
682
+ input_ids: torch.LongTensor = None,
683
+ attention_mask: Optional[torch.Tensor] = None,
684
+ position_ids: Optional[torch.LongTensor] = None,
685
+ past_key_values: Optional[HybridCache] = None,
686
+ inputs_embeds: Optional[torch.FloatTensor] = None,
687
+ use_cache: Optional[bool] = None,
688
+ output_attentions: Optional[bool] = None,
689
+ output_hidden_states: Optional[bool] = None,
690
+ return_dict: Optional[bool] = None,
691
+ cache_position: Optional[torch.LongTensor] = None,
692
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
693
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
694
+ output_hidden_states = (
695
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
696
+ )
697
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
698
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
699
+
700
+ if (input_ids is None) ^ (inputs_embeds is not None):
701
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
702
+
703
+ if self.gradient_checkpointing and self.training and use_cache:
704
+ logger.warning_once(
705
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
706
+ )
707
+ use_cache = False
708
+
709
+ if inputs_embeds is None:
710
+ inputs_embeds = self.embed_tokens(input_ids)
711
+
712
+ if use_cache and past_key_values is None and not self.training:
713
+ batch_size, seq_len, _ = inputs_embeds.shape
714
+ past_key_values = HybridCache(
715
+ self.config,
716
+ batch_size=batch_size,
717
+ max_cache_len=seq_len,
718
+ device=self.device,
719
+ dtype=inputs_embeds.dtype,
720
+ )
721
+
722
+ if cache_position is None:
723
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
724
+ cache_position = torch.arange(
725
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
726
+ )
727
+
728
+ if position_ids is None:
729
+ position_ids = cache_position.unsqueeze(0)
730
+
731
+ causal_mask = self._update_causal_mask(
732
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
733
+ )
734
+
735
+ # embed positions
736
+ hidden_states = inputs_embeds
737
+
738
+ # normalized
739
+ # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
740
+ # See https://github.com/huggingface/transformers/pull/29402
741
+ normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
742
+ hidden_states = hidden_states * normalizer
743
+
744
+ # decoder layers
745
+ all_hidden_states = () if output_hidden_states else None
746
+ all_self_attns = () if output_attentions else None
747
+
748
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
749
+ if output_hidden_states:
750
+ all_hidden_states += (hidden_states,)
751
+
752
+ if self.gradient_checkpointing and self.training:
753
+ layer_outputs = self._gradient_checkpointing_func(
754
+ decoder_layer.__call__,
755
+ hidden_states,
756
+ causal_mask,
757
+ position_ids,
758
+ past_key_values,
759
+ output_attentions,
760
+ use_cache,
761
+ cache_position,
762
+ )
763
+ else:
764
+ layer_outputs = decoder_layer(
765
+ hidden_states,
766
+ attention_mask=causal_mask,
767
+ position_ids=position_ids,
768
+ past_key_value=past_key_values,
769
+ output_attentions=output_attentions,
770
+ use_cache=use_cache,
771
+ cache_position=cache_position,
772
+ )
773
+
774
+ hidden_states = layer_outputs[0]
775
+
776
+ if output_attentions:
777
+ all_self_attns += (layer_outputs[1],)
778
+
779
+ hidden_states = self.norm(hidden_states)
780
+
781
+ if output_hidden_states:
782
+ all_hidden_states += (hidden_states,)
783
+
784
+ next_cache = past_key_values if use_cache else None
785
+
786
+ if not return_dict:
787
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
788
+ return BaseModelOutputWithPast(
789
+ last_hidden_state=hidden_states,
790
+ past_key_values=next_cache,
791
+ hidden_states=all_hidden_states,
792
+ attentions=all_self_attns,
793
+ )
794
+
795
+ @torch.no_grad()
796
+ def _update_causal_mask(
797
+ self,
798
+ attention_mask: torch.Tensor,
799
+ input_tensor: torch.Tensor,
800
+ cache_position: torch.Tensor,
801
+ past_key_values: HybridCache,
802
+ output_attentions: bool,
803
+ ):
804
+ # Flash Attention currently doesn't support static cache but Gemma2 works only with static cache.
805
+ # So we will pass in the attention mask as is in any case, not only when there's padding. Then we'll use its shape
806
+ # to cut out keys/values trailing 0 used in static cache. This workaround should be compile compatible
807
+ # as it doesn't cause dynamic control issues.
808
+ if self.config._attn_implementation == "flash_attention_2":
809
+ return attention_mask
810
+
811
+ dtype, device = input_tensor.dtype, input_tensor.device
812
+ sequence_length = input_tensor.shape[1]
813
+ if isinstance(past_key_values, HybridCache):
814
+ target_length = past_key_values.get_max_cache_shape()
815
+ else:
816
+ target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]
817
+
818
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
819
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
820
+ attention_mask,
821
+ sequence_length=sequence_length,
822
+ target_length=target_length,
823
+ dtype=dtype,
824
+ device=device,
825
+ cache_position=cache_position,
826
+ batch_size=input_tensor.shape[0],
827
+ )
828
+ return causal_mask
829
+
830
+ @staticmethod
831
+ def _prepare_4d_causal_attention_mask_with_cache_position(
832
+ attention_mask: torch.Tensor,
833
+ sequence_length: int,
834
+ target_length: int,
835
+ dtype: torch.dtype,
836
+ device: torch.device,
837
+ cache_position: torch.Tensor,
838
+ batch_size: int,
839
+ **kwargs,
840
+ ):
841
+ """
842
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
843
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
844
+
845
+ Args:
846
+ attention_mask (`torch.Tensor`):
847
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
848
+ `(batch_size, 1, query_length, key_value_length)`.
849
+ sequence_length (`int`):
850
+ The sequence length being processed.
851
+ target_length (`int`):
852
+ The target length: when generating with static cache, the mask should be as long as the static cache,
853
+ to account for the 0 padding, the part of the cache that is not filled yet.
854
+ dtype (`torch.dtype`):
855
+ The dtype to use for the 4D attention mask.
856
+ device (`torch.device`):
857
+ The device to place the 4D attention mask on.
858
+ cache_position (`torch.Tensor`):
859
+ Indices depicting the position of the input sequence tokens in the sequence.
860
+ batch_size (`torch.Tensor`):
861
+ Batch size.
862
+ """
863
+ if attention_mask is not None and attention_mask.dim() == 4:
864
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
865
+ causal_mask = attention_mask
866
+ else:
867
+ min_dtype = torch.finfo(dtype).min
868
+ causal_mask = torch.full(
869
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
870
+ )
871
+ if sequence_length != 1:
872
+ causal_mask = torch.triu(causal_mask, diagonal=1)
873
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
874
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
875
+ if attention_mask is not None:
876
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
877
+ mask_length = attention_mask.shape[-1]
878
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
879
+ padding_mask = padding_mask == 0
880
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
881
+ padding_mask, min_dtype
882
+ )
883
+
884
+ return causal_mask
885
+
886
+
887
+ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
888
+ _tied_weights_keys = ["lm_head.weight"]
889
+ _tp_plan = {"lm_head": "colwise_rep"}
890
+
891
+ def __init__(self, config):
892
+ super().__init__(config)
893
+ self.model = Gemma2Model(config)
894
+ self.vocab_size = config.vocab_size
895
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
896
+
897
+ # Initialize weights and apply final processing
898
+ self.post_init()
899
+
900
+ def get_input_embeddings(self):
901
+ return self.model.embed_tokens
902
+
903
+ def set_input_embeddings(self, value):
904
+ self.model.embed_tokens = value
905
+
906
+ def get_output_embeddings(self):
907
+ return self.lm_head
908
+
909
+ def set_output_embeddings(self, new_embeddings):
910
+ self.lm_head = new_embeddings
911
+
912
+ def set_decoder(self, decoder):
913
+ self.model = decoder
914
+
915
+ def get_decoder(self):
916
+ return self.model
917
+
918
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
919
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
920
+ def forward(
921
+ self,
922
+ input_ids: torch.LongTensor = None,
923
+ attention_mask: Optional[torch.Tensor] = None,
924
+ position_ids: Optional[torch.LongTensor] = None,
925
+ past_key_values: Optional[HybridCache] = None,
926
+ inputs_embeds: Optional[torch.FloatTensor] = None,
927
+ labels: Optional[torch.LongTensor] = None,
928
+ use_cache: Optional[bool] = None,
929
+ output_attentions: Optional[bool] = None,
930
+ output_hidden_states: Optional[bool] = None,
931
+ return_dict: Optional[bool] = None,
932
+ cache_position: Optional[torch.LongTensor] = None,
933
+ num_logits_to_keep: int = 0,
934
+ **loss_kwargs,
935
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
936
+ r"""
937
+ Args:
938
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
939
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
940
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
941
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
942
+
943
+ num_logits_to_keep (`int`, *optional*):
944
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
945
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
946
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
947
+
948
+ Returns:
949
+
950
+ Example:
951
+
952
+ ```python
953
+ >>> from transformers import AutoTokenizer, Gemma2ForCausalLM
954
+
955
+ >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
956
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
957
+
958
+ >>> prompt = "What is your favorite condiment?"
959
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
960
+
961
+ >>> # Generate
962
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
963
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
964
+ "What is your favorite condiment?"
965
+ ```"""
966
+
967
+ if self.training and self.config._attn_implementation != "eager":
968
+ logger.warning_once(
969
+ "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
970
+ f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
971
+ )
972
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
973
+ output_hidden_states = (
974
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
975
+ )
976
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
977
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
978
+ outputs = self.model(
979
+ input_ids=input_ids,
980
+ attention_mask=attention_mask,
981
+ position_ids=position_ids,
982
+ past_key_values=past_key_values,
983
+ inputs_embeds=inputs_embeds,
984
+ use_cache=use_cache,
985
+ output_attentions=output_attentions,
986
+ output_hidden_states=output_hidden_states,
987
+ return_dict=return_dict,
988
+ cache_position=cache_position,
989
+ )
990
+
991
+ hidden_states = outputs[0]
992
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
993
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
994
+ if self.config.final_logit_softcapping is not None:
995
+ logits = logits / self.config.final_logit_softcapping
996
+ logits = torch.tanh(logits)
997
+ logits = logits * self.config.final_logit_softcapping
998
+
999
+ loss = None
1000
+ if labels is not None:
1001
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
1002
+
1003
+ if not return_dict:
1004
+ output = (logits,) + outputs[1:]
1005
+ return (loss,) + output if loss is not None else output
1006
+
1007
+ return CausalLMOutputWithPast(
1008
+ loss=loss,
1009
+ logits=logits,
1010
+ past_key_values=outputs.past_key_values,
1011
+ hidden_states=outputs.hidden_states,
1012
+ attentions=outputs.attentions,
1013
+ )
1014
+
1015
+ def prepare_inputs_for_generation(
1016
+ self,
1017
+ input_ids,
1018
+ past_key_values=None,
1019
+ attention_mask=None,
1020
+ inputs_embeds=None,
1021
+ cache_position=None,
1022
+ position_ids=None,
1023
+ use_cache=True,
1024
+ num_logits_to_keep=None,
1025
+ **kwargs,
1026
+ ):
1027
+ # Overwritten: has a special cache type, `HybridCache`
1028
+
1029
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1030
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1031
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1032
+ if past_key_values is not None:
1033
+ if inputs_embeds is not None: # Exception 1
1034
+ input_ids = input_ids[:, -cache_position.shape[0] :]
1035
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1036
+ input_ids = input_ids[:, cache_position]
1037
+ if attention_mask is not None and position_ids is None:
1038
+ # create position_ids on the fly for batch generation
1039
+ position_ids = attention_mask.long().cumsum(-1) - 1
1040
+ position_ids.masked_fill_(attention_mask == 0, 1)
1041
+ if past_key_values:
1042
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1043
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
1044
+ # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride
1045
+ # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
1046
+ # batch size = 1 case, `position_ids` is already contiguous but with varying stride
1047
+ # which retriggers a capture.
1048
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1049
+
1050
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1051
+ if inputs_embeds is not None and cache_position[0] == 0:
1052
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
1053
+ else:
1054
+ # The clone here is for the same reason as for `position_ids`.
1055
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
1056
+
1057
+ if (
1058
+ isinstance(past_key_values, HybridCache)
1059
+ and attention_mask.ndim == 2
1060
+ and not self.config._attn_implementation == "flash_attention_2"
1061
+ ):
1062
+ if model_inputs["inputs_embeds"] is not None:
1063
+ batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
1064
+ device = model_inputs["inputs_embeds"].device
1065
+ else:
1066
+ batch_size, sequence_length = model_inputs["input_ids"].shape
1067
+ device = model_inputs["input_ids"].device
1068
+
1069
+ attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
1070
+ attention_mask,
1071
+ sequence_length=sequence_length,
1072
+ target_length=past_key_values.get_max_cache_shape(),
1073
+ dtype=self.lm_head.weight.dtype,
1074
+ device=device,
1075
+ cache_position=cache_position,
1076
+ batch_size=batch_size,
1077
+ )
1078
+
1079
+ if num_logits_to_keep is not None:
1080
+ model_inputs["num_logits_to_keep"] = num_logits_to_keep
1081
+
1082
+ model_inputs.update(
1083
+ {
1084
+ "position_ids": position_ids,
1085
+ "cache_position": cache_position,
1086
+ "past_key_values": past_key_values,
1087
+ "use_cache": use_cache,
1088
+ "attention_mask": attention_mask,
1089
+ }
1090
+ )
1091
+ return model_inputs
1092
+
1093
+
1094
+ @add_start_docstrings(
1095
+ """
1096
+ The Gemma2 Model transformer with a sequence classification head on top (linear layer).
1097
+
1098
+ [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1099
+ (e.g. GPT-2) do.
1100
+
1101
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1102
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1103
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1104
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1105
+ each row of the batch).
1106
+ """,
1107
+ GEMMA2_START_DOCSTRING,
1108
+ )
1109
+ class Gemma2ForSequenceClassification(Gemma2PreTrainedModel):
1110
+ def __init__(self, config):
1111
+ super().__init__(config)
1112
+ self.num_labels = config.num_labels
1113
+ self.model = Gemma2Model(config)
1114
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1115
+
1116
+ # Initialize weights and apply final processing
1117
+ self.post_init()
1118
+
1119
+ def get_input_embeddings(self):
1120
+ return self.model.embed_tokens
1121
+
1122
+ def set_input_embeddings(self, value):
1123
+ self.model.embed_tokens = value
1124
+
1125
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
1126
+ def forward(
1127
+ self,
1128
+ input_ids: Optional[torch.LongTensor] = None,
1129
+ attention_mask: Optional[torch.Tensor] = None,
1130
+ position_ids: Optional[torch.LongTensor] = None,
1131
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1132
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1133
+ labels: Optional[torch.LongTensor] = None,
1134
+ use_cache: Optional[bool] = None,
1135
+ output_attentions: Optional[bool] = None,
1136
+ output_hidden_states: Optional[bool] = None,
1137
+ return_dict: Optional[bool] = None,
1138
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1139
+ r"""
1140
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1141
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1142
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1143
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1144
+ """
1145
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1146
+
1147
+ transformer_outputs = self.model(
1148
+ input_ids,
1149
+ attention_mask=attention_mask,
1150
+ position_ids=position_ids,
1151
+ past_key_values=past_key_values,
1152
+ inputs_embeds=inputs_embeds,
1153
+ use_cache=use_cache,
1154
+ output_attentions=output_attentions,
1155
+ output_hidden_states=output_hidden_states,
1156
+ return_dict=return_dict,
1157
+ )
1158
+ hidden_states = transformer_outputs[0]
1159
+ logits = self.score(hidden_states)
1160
+
1161
+ if input_ids is not None:
1162
+ batch_size = input_ids.shape[0]
1163
+ else:
1164
+ batch_size = inputs_embeds.shape[0]
1165
+
1166
+ if self.config.pad_token_id is None and batch_size != 1:
1167
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1168
+ if self.config.pad_token_id is None:
1169
+ sequence_lengths = -1
1170
+ else:
1171
+ if input_ids is not None:
1172
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1173
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1174
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1175
+ sequence_lengths = sequence_lengths.to(logits.device)
1176
+ else:
1177
+ sequence_lengths = -1
1178
+
1179
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1180
+
1181
+ loss = None
1182
+ if labels is not None:
1183
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
1184
+
1185
+ if not return_dict:
1186
+ output = (pooled_logits,) + transformer_outputs[1:]
1187
+ return ((loss,) + output) if loss is not None else output
1188
+
1189
+ return SequenceClassifierOutputWithPast(
1190
+ loss=loss,
1191
+ logits=pooled_logits,
1192
+ past_key_values=transformer_outputs.past_key_values,
1193
+ hidden_states=transformer_outputs.hidden_states,
1194
+ attentions=transformer_outputs.attentions,
1195
+ )
1196
+
1197
+
1198
+ @add_start_docstrings(
1199
+ """
1200
+ The Gemma2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1201
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1202
+ """,
1203
+ GEMMA2_START_DOCSTRING,
1204
+ )
1205
+ class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
1206
+ def __init__(self, config):
1207
+ super().__init__(config)
1208
+ self.num_labels = config.num_labels
1209
+ self.model = Gemma2Model(config)
1210
+ if getattr(config, "classifier_dropout", None) is not None:
1211
+ classifier_dropout = config.classifier_dropout
1212
+ elif getattr(config, "hidden_dropout", None) is not None:
1213
+ classifier_dropout = config.hidden_dropout
1214
+ else:
1215
+ classifier_dropout = 0.1
1216
+ self.dropout = nn.Dropout(classifier_dropout)
1217
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1218
+
1219
+ # Initialize weights and apply final processing
1220
+ self.post_init()
1221
+
1222
+ def get_input_embeddings(self):
1223
+ return self.model.embed_tokens
1224
+
1225
+ def set_input_embeddings(self, value):
1226
+ self.model.embed_tokens = value
1227
+
1228
+ @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
1229
+ @add_code_sample_docstrings(
1230
+ checkpoint=_CHECKPOINT_FOR_DOC,
1231
+ output_type=TokenClassifierOutput,
1232
+ config_class=_CONFIG_FOR_DOC,
1233
+ )
1234
+ def forward(
1235
+ self,
1236
+ input_ids: Optional[torch.LongTensor] = None,
1237
+ attention_mask: Optional[torch.Tensor] = None,
1238
+ position_ids: Optional[torch.LongTensor] = None,
1239
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1240
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1241
+ labels: Optional[torch.LongTensor] = None,
1242
+ use_cache: Optional[bool] = None,
1243
+ output_attentions: Optional[bool] = None,
1244
+ output_hidden_states: Optional[bool] = None,
1245
+ return_dict: Optional[bool] = None,
1246
+ ) -> Union[Tuple, TokenClassifierOutput]:
1247
+ r"""
1248
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1249
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
1250
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1251
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1252
+ """
1253
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1254
+
1255
+ outputs = self.model(
1256
+ input_ids,
1257
+ attention_mask=attention_mask,
1258
+ position_ids=position_ids,
1259
+ past_key_values=past_key_values,
1260
+ inputs_embeds=inputs_embeds,
1261
+ use_cache=use_cache,
1262
+ output_attentions=output_attentions,
1263
+ output_hidden_states=output_hidden_states,
1264
+ return_dict=return_dict,
1265
+ )
1266
+ sequence_output = outputs[0]
1267
+ sequence_output = self.dropout(sequence_output)
1268
+ logits = self.score(sequence_output)
1269
+
1270
+ loss = None
1271
+ if labels is not None:
1272
+ loss = self.loss_function(logits, labels, self.config)
1273
+
1274
+ if not return_dict:
1275
+ output = (logits,) + outputs[2:]
1276
+ return ((loss,) + output) if loss is not None else output
1277
+
1278
+ return TokenClassifierOutput(
1279
+ loss=loss,
1280
+ logits=logits,
1281
+ hidden_states=outputs.hidden_states,
1282
+ attentions=outputs.attentions,
1283
+ )
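A quick note on the final-logit soft-capping used in `Gemma2ForCausalLM.forward` above (divide by `config.final_logit_softcapping`, apply `tanh`, multiply back): it bounds every logit to the open interval `(-cap, cap)` while leaving small logits almost unchanged. Below is a minimal sketch of the transform in isolation; the cap value `30.0` is only an illustrative placeholder, the real value comes from `Gemma2Config`.

```python
import torch

def softcap(logits: torch.Tensor, cap: float = 30.0) -> torch.Tensor:
    # same three steps as in Gemma2ForCausalLM.forward: scale down, tanh, scale back up
    return torch.tanh(logits / cap) * cap

x = torch.tensor([-100.0, -5.0, 0.0, 5.0, 100.0])
print(softcap(x))  # bounded to (-30, 30); values near zero pass through almost unchanged
```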
modeling_spatialvla.py ADDED
@@ -0,0 +1,528 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch PaliGemmamodel."""
16
+
17
+ from dataclasses import dataclass
18
+ from typing import List, Optional, Tuple, Union
19
+
20
+ import os
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+ from torch.linalg import inv
25
+ import torchvision.transforms.functional as TF
26
+ import torch.nn.functional as F
27
+ from transformers.cache_utils import Cache, HybridCache, StaticCache
28
+ from transformers.generation import GenerationMixin
29
+ from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
30
+ from transformers.utils import (
31
+ ModelOutput,
32
+ logging,
33
+ )
34
+ from .configuration_spatialvla import SpatialVLAConfig
35
+ from .modeling_gemma2 import Gemma2ForCausalLM
36
+ from transformers import AutoModel, ZoeDepthForDepthEstimation
37
+
38
+ SIGLIP_MEAN, SIGLIP_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
39
+ ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
40
+
41
+ logger = logging.get_logger(__name__)
42
+
43
+ class Ego3DPositionEmbeddingMLP(nn.Module):
44
+ """Absolute pos embedding, learned.
45
+ https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4
46
+ """
47
+
48
+ def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True):
49
+ super(Ego3DPositionEmbeddingMLP, self).__init__()
50
+ self.n_freqs = n_freqs
51
+ self.freq_out_channels = in_channels * (2 * n_freqs + 1)
52
+ if logscale:
53
+ freq_bands = 2 ** torch.linspace(0, n_freqs - 1, n_freqs)
54
+ else:
55
+ freq_bands = torch.linspace(1, 2 ** (n_freqs - 1), n_freqs)
56
+
57
+ center = torch.tensor([0., 0., 2.]).repeat(in_channels // 3)
58
+ self.register_buffer("freq_bands", freq_bands, persistent=False)
59
+ self.register_buffer("center", center, persistent=False)
60
+
61
+ self.position_embedding_head = nn.Sequential(
62
+ nn.Linear(self.freq_out_channels, num_pos_feats),
63
+ nn.LayerNorm(num_pos_feats),
64
+ nn.ReLU(),
65
+ nn.Linear(num_pos_feats, num_pos_feats),
66
+ )
67
+ self._reset_parameters()
68
+
69
+ def _reset_parameters(self):
70
+ """init with small weights to maintain stable training."""
71
+ for p in self.parameters():
72
+ if p.dim() > 1:
73
+ nn.init.xavier_uniform_(p, gain=0.01)
74
+
75
+ @torch.no_grad()
76
+ def frequency_encoding(self, xyz):
77
+ """
78
+ Embeds x to (x, sin(2^k x), cos(2^k x), ...)
79
+ Different from the paper, "x" is also in the output
80
+ See https://github.com/bmild/nerf/issues/12
81
+ x \in [-2, 2]
82
+ y \in [-2, 2]
83
+ z \in [0., 4]
84
+ Inputs:
85
+ x: (b n m)
86
+ Outputs:
87
+ out: (b n o)
88
+ """
89
+ xyz_n = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype)
90
+ xyz_feq = xyz_n.unsqueeze(-1) * self.freq_bands # (b n m 1) * (nf,) -> (b n m nf)
91
+ sin_xyz, cos_xyz = torch.sin(xyz_feq), torch.cos(xyz_feq) # (b n m nf)
92
+ encoding = torch.cat([xyz_n.unsqueeze(-1), sin_xyz, cos_xyz], -1).reshape(*xyz.shape[:2], -1)
93
+ return encoding
94
+
95
+ def forward(self, xyz):
96
+ """Forward pass, xyz is (B, N, 3or6), output (B, N, F)."""
97
+ freq_encoding = self.frequency_encoding(xyz)
98
+ position_embedding = self.position_embedding_head(freq_encoding)
99
+ return position_embedding
100
+
101
+ def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)):
102
+ """https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py"""
103
+ # h, w = images.shape[-2:]
104
+ # pad
105
+ ph, pw = 31, 31 # int((h / 2)**0.5 * 3), int((w / 2)**0.5 * 3) # 32, 31
106
+ images = F.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode)
107
+ # resize
108
+ size = (384, 384) # get_resize_output_image_size
109
+ images = F.interpolate(images, size=size, mode="bicubic", align_corners=True)
110
+ # zoe: padding -> resize -> normalize. we follow `normalize -> padding -> resize` from siglip
111
+ images = TF.normalize(images, mean=ZOE_MEAN, std=ZOE_STD)
112
+ return images, ph, pw
113
+
114
+ @dataclass
115
+ class SpatialVLACausalLMOutputWithPast(ModelOutput):
116
+ loss: Optional[torch.FloatTensor] = None
117
+ logits: torch.FloatTensor = None
118
+ past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
119
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
120
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
121
+ image_hidden_states: Optional[torch.FloatTensor] = None
122
+
123
+ class SpatialVLAMultiModalProjector(nn.Module):
124
+ def __init__(self, config: SpatialVLAConfig):
125
+ super().__init__()
126
+ self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True)
127
+
128
+ def forward(self, image_features):
129
+ hidden_states = self.linear(image_features)
130
+ return hidden_states
131
+
132
+ class SpatialVLAPreTrainedModel(PreTrainedModel):
133
+ config_class = SpatialVLAConfig
134
+ base_model_prefix = "model"
135
+ supports_gradient_checkpointing = True
136
+ _no_split_modules = ["SpatialVLAMultiModalProjector", "ZoeDepthForDepthEstimation", "Ego3DPositionEmbeddingMLP"]
137
+ _skip_keys_device_placement = "past_key_values"
138
+ _supports_cache_class = True
139
+ _supports_quantized_cache = True
140
+ _supports_static_cache = True
142
+ _supports_flash_attn_2 = True
143
+ _supports_sdpa = True
144
+
145
+ def _init_weights(self, module):
146
+ std = (
147
+ self.config.initializer_range
148
+ if hasattr(self.config, "initializer_range")
149
+ else self.config.text_config.initializer_range
150
+ )
151
+
152
+ if hasattr(module, "class_embedding"):
153
+ module.class_embedding.data.normal_(mean=0.0, std=std)
154
+
155
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
156
+ module.weight.data.normal_(mean=0.0, std=std)
157
+ if module.bias is not None:
158
+ module.bias.data.zero_()
159
+ elif isinstance(module, nn.Embedding):
160
+ module.weight.data.normal_(mean=0.0, std=std)
161
+ if module.padding_idx is not None:
162
+ module.weight.data[module.padding_idx].zero_()
163
+
164
+ class SpatialVLAForConditionalGeneration(SpatialVLAPreTrainedModel, GenerationMixin):
165
+ def __init__(self, config: SpatialVLAConfig, vision_model=None, vision_zoe_model=None, projector_model=None, language_model=None):
166
+ super().__init__(config)
167
+
168
+ self.vision_tower = vision_model or AutoModel.from_config(config=config.vision_config)
169
+ self.multi_modal_projector = projector_model or SpatialVLAMultiModalProjector(config)
170
+ self.vocab_size = config.text_config.vocab_size
171
+ if language_model is None:
172
+ language_model = Gemma2ForCausalLM(config=config.text_config)
173
+ if language_model._tied_weights_keys is not None:
174
+ self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
175
+ self.language_model = language_model
176
+
177
+ if config.use_vision_zoe:
178
+ self.vision_zoe_model = vision_zoe_model or ZoeDepthForDepthEstimation(config.vision_zoe_config)
179
+ self.position_embedding_3d = Ego3DPositionEmbeddingMLP(
180
+ config.ego3d_patch_reso**2 * 3, num_pos_feats=config.vision_config.hidden_size, n_freqs=config.n_freqs
181
+ )
182
+ # register buffer
183
+ patch_size, reso, image_size = config.vision_config.patch_size, config.ego3d_patch_reso, config.vision_config.image_size
184
+ y, x = torch.meshgrid(torch.arange(0, image_size, patch_size // reso), torch.arange(0, image_size, patch_size // reso), indexing="ij") # (h//sp w//sp)
185
+ y, x = y + patch_size / reso / 2, x + patch_size / reso / 2
186
+ uv_h = torch.stack([x, y, torch.ones_like(x)], dim=0).reshape(3, -1) # (3 hw)
187
+ self.register_buffer("uv_h", uv_h, persistent=False)
188
+
189
+ # shared spatial embeddings for <ACTION> <IMG>
190
+ if config.use_spatial_token:
191
+ self.spatial_embed_tokens = nn.Embedding(self.config.spatial_token_num, config.text_config.hidden_size)
192
+ else:
193
+ self.spatial_embed_tokens = None
194
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
195
+
196
+
197
+ def backproject_patch(self, K: torch.Tensor, depth: torch.Tensor, patch_size=14, reso=2) -> torch.Tensor:
198
+ """
199
+ Backproject depth map to 3D points in camera coordinate.
200
+ Args:
201
+ K: camera intrinsic matrix (b 3 3)
202
+ depth: depth map (b 1 h w)
203
+ patch_size: patch size for siglip
204
+ reso: reso^2 -> sample points in each patch
205
+ patch sz = 14 ......
206
+ ┌────────┬────────┐
207
+ │ ─ ─ │ ─ ─ │
208
+ │ points │ ├─ ─ ─
209
+ │ ─ ─ │ ─ ─ │
210
+ ├────────┼────────┤
211
+ │ ─ ─ │ ─ ─ │
212
+ │ │ │
213
+ │ ─ ─ │ ─ ─ │
214
+ └────────┴────────┘
215
+ reso=2───►points=4
216
+
217
+
218
+ """
219
+ b, c, h, w = depth.shape
220
+ hp, wp = h // patch_size, w // patch_size
221
+ sub_hp = sub_wp = reso
222
+ patch_depth = F.interpolate(depth, size=(hp * reso, wp * reso), mode="area").reshape(b, c, -1)
223
+ p_cam = (inv(K.float()) @ self.uv_h.float()) * patch_depth # (b 3 3) @ (3 hw) -> (b 3 hw) * (b 1 hw) -> (b 3 hw)
224
+ patch_p_cam = p_cam.reshape(b, 3, hp, sub_hp, wp, sub_wp).permute(0, 2, 4, 3, 5, 1).reshape(b, hp * wp, -1)
225
+ return patch_p_cam
226
+
227
+ def get_input_embeddings(self):
228
+ return self.language_model.get_input_embeddings()
229
+
230
+ def set_input_embeddings(self, value):
231
+ self.language_model.set_input_embeddings(value)
232
+
233
+ def get_output_embeddings(self):
234
+ return self.language_model.get_output_embeddings()
235
+
236
+ def set_output_embeddings(self, new_embeddings):
237
+ self.language_model.set_output_embeddings(new_embeddings)
238
+
239
+ def set_decoder(self, decoder):
240
+ self.language_model.set_decoder(decoder)
241
+
242
+ def get_decoder(self):
243
+ return self.language_model.get_decoder()
244
+
245
+ def tie_weights(self):
246
+ return self.language_model.tie_weights()
247
+
248
+ def resize_token_embeddings(
249
+ self,
250
+ new_num_tokens: Optional[int] = None,
251
+ pad_to_multiple_of: Optional[int] = None,
252
+ mean_resizing: bool = True,
253
+ ) -> nn.Embedding:
254
+ model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
255
+ vocab_size = model_embeds.weight.shape[0]
256
+ self.config.text_config.vocab_size = self.vocab_size = self.config._vocab_size = vocab_size
257
+ self.tie_weights()
258
+ return model_embeds
259
+
260
+ def _update_causal_mask(
261
+ self,
262
+ attention_mask,
263
+ token_type_ids,
264
+ past_key_values,
265
+ cache_position,
266
+ input_ids=None,
267
+ inputs_embeds=None,
268
+ is_training: bool = False,
269
+ ):
270
+ if self.config.text_config._attn_implementation == "flash_attention_2":
271
+ if attention_mask is not None and 0.0 in attention_mask:
272
+ return attention_mask
273
+ return None
274
+
275
+ using_static_cache = isinstance(past_key_values, StaticCache)
276
+ min_dtype = torch.finfo(self.dtype).min
277
+ inputs_lead_dim = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
278
+ sequence_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
279
+ if using_static_cache:
280
+ target_length = past_key_values.get_max_cache_shape()
281
+ elif isinstance(past_key_values, HybridCache):
282
+ target_length = past_key_values.get_max_cache_shape()
283
+ else:
284
+ target_length = (
285
+ attention_mask.shape[-1]
286
+ if isinstance(attention_mask, torch.Tensor)
287
+ else cache_position[0] + sequence_length + 1
288
+ )
289
+
290
+ if attention_mask is not None and attention_mask.dim() == 4:
291
+ return attention_mask
292
+
293
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=self.dtype, device=cache_position.device)
294
+ if sequence_length != 1:
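+ # prefill at inference: the whole prefix (image + prompt) attends bidirectionally, following the PaliGemma prefix-LM scheme; during training a causal mask is built here and prefix positions are re-opened below via token_type_ids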
295
+ if is_training: causal_mask = torch.triu(causal_mask, diagonal=1)
296
+ else: causal_mask[:, :sequence_length] = 0.0
297
+
298
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
299
+ causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
300
+ if attention_mask is not None:
301
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
302
+ mask_length = attention_mask.shape[-1]
303
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
304
+ padding_mask = padding_mask == 0
305
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
306
+ if is_training:
307
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0)
308
+ return causal_mask
309
+
310
+ def get_image_features(self, pixel_values: torch.FloatTensor, intrinsic: torch.FloatTensor):
311
+ siglip_pixel_values = TF.normalize(pixel_values, mean=SIGLIP_MEAN, std=SIGLIP_STD)
312
+ image_outputs = self.vision_tower(siglip_pixel_values)
313
+
314
+ # ego3d position encoding
315
+ if self.config.use_vision_zoe:
316
+ zoe_pixel_values, ph, pw = process_zoe(pixel_values, pad_mode="reflect")
317
+ with torch.no_grad():
318
+ pvh, pvw = pixel_values.shape[-2:]
319
+ depth = self.vision_zoe_model(pixel_values=zoe_pixel_values).predicted_depth
320
+ depth = F.interpolate(
321
+ depth.unsqueeze(1),
322
+ size=(pvh+2*ph, pvw+2*pw),
323
+ mode="bicubic",
324
+ align_corners=True,
325
+ )[..., ph:-ph, pw:-pw]
326
+ xyz = self.backproject_patch(
327
+ intrinsic, depth, patch_size=self.config.vision_config.patch_size, reso=self.config.ego3d_patch_reso
328
+ ) # (b, n, 3*4)
329
+ pos_embed_3d = self.position_embedding_3d(xyz)
330
+ selected_image_feature = image_outputs.last_hidden_state + pos_embed_3d
331
+ else:
332
+ selected_image_feature = image_outputs.last_hidden_state
333
+ image_features = self.multi_modal_projector(selected_image_feature)
334
+ image_features = image_features / (self.config.text_config.hidden_size**0.5)
335
+ return image_features
336
+
337
+ def forward(
338
+ self,
339
+ input_ids: torch.LongTensor = None,
340
+ pixel_values: torch.FloatTensor = None,
341
+ actions: Optional[torch.FloatTensor] = None,
342
+ intrinsic: Optional[torch.Tensor] = None,
343
+ attention_mask: Optional[torch.Tensor] = None,
344
+ position_ids: Optional[torch.LongTensor] = None,
345
+ past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
346
+ token_type_ids: Optional[torch.LongTensor] = None,
347
+ cache_position: Optional[torch.LongTensor] = None,
348
+ inputs_embeds: Optional[torch.FloatTensor] = None,
349
+ labels: Optional[torch.LongTensor] = None,
350
+ use_cache: Optional[bool] = None,
351
+ output_attentions: Optional[bool] = None,
352
+ output_hidden_states: Optional[bool] = None,
353
+ return_dict: Optional[bool] = None,
354
+ num_logits_to_keep: int = 0,
355
+ ) -> Union[Tuple, SpatialVLACausalLMOutputWithPast]:
356
+
357
+ output_attentions = output_attentions or self.config.output_attentions
358
+ output_hidden_states = output_hidden_states or self.config.output_hidden_states
359
+ return_dict = return_dict or self.config.use_return_dict
360
+
361
+ is_training = token_type_ids is not None and labels is not None
362
+
363
+ if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids).clone() # clone to avoid grad issues with gradient checkpointing
364
+
365
+ if self.config.use_spatial_token:
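+ # zero the language-model embeddings at spatial/action token positions and substitute the shared spatial embedding table ("* 0.0 +" keeps the original tensor in the autograd graph)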
366
+ spatial_selected = (input_ids >= self.config.action_token_begin_idx) & (input_ids < self.config.action_token_begin_idx + self.config.spatial_token_num)
367
+ inputs_embeds[spatial_selected] = inputs_embeds[spatial_selected] * 0.0 + self.spatial_embed_tokens(input_ids[spatial_selected] - self.config.action_token_begin_idx)
368
+
369
+ if cache_position is None:
370
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
371
+ cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device)
372
+
373
+ if position_ids is None:
374
+ position_ids = cache_position.unsqueeze(0) + 1 # Paligemma positions are 1-indexed
375
+
376
+ # merge
377
+ if pixel_values is not None:
378
+ image_features = self.get_image_features(pixel_values, intrinsic)
379
+ special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
380
+ special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
381
+ if inputs_embeds[special_image_mask].numel() != image_features.numel():
382
+ image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index)
383
+ raise ValueError(
384
+ f"Number of images does not match number of special image tokens in the input text. "
385
+ f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
386
+ "tokens from image embeddings."
387
+ )
388
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
389
+ inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
390
+
391
+ # mask out pad-token-ids in labels for BC
392
+ if labels is not None and self.pad_token_id in labels:
393
+ logger.warning_once(
394
+ "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
395
+ "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
396
+ )
397
+ labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
398
+
399
+ causal_mask = self._update_causal_mask(
400
+ attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training
401
+ )
402
+ outputs = self.language_model(
403
+ attention_mask=causal_mask,
404
+ position_ids=position_ids,
405
+ past_key_values=past_key_values,
406
+ inputs_embeds=inputs_embeds,
407
+ use_cache=use_cache,
408
+ output_attentions=output_attentions,
409
+ output_hidden_states=output_hidden_states,
410
+ return_dict=return_dict,
411
+ cache_position=cache_position,
412
+ num_logits_to_keep=num_logits_to_keep,
413
+ )
414
+
415
+ logits = outputs.logits
416
+ loss = None
417
+ if labels is not None:
418
+ logits = logits.float()
419
+ shift_logits = logits[..., :-1, :]
420
+ shift_labels = labels[..., 1:]
421
+ if attention_mask is not None:
422
+ shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
423
+ shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
424
+ shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
425
+ else:
426
+ shift_logits = shift_logits.contiguous()
427
+ shift_labels = shift_labels.contiguous()
428
+ loss_fct = nn.CrossEntropyLoss()
429
+
430
+ flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
431
+ flat_labels = shift_labels.view(-1).to(shift_logits.device)
432
+ loss = loss_fct(flat_logits, flat_labels)
433
+ if not return_dict:
434
+ output = (logits,) + outputs[1:]
435
+ return (loss,) + output if loss is not None else output
436
+
437
+ return SpatialVLACausalLMOutputWithPast(
438
+ loss=loss,
439
+ logits=logits,
440
+ past_key_values=outputs.past_key_values,
441
+ hidden_states=outputs.hidden_states,
442
+ attentions=outputs.attentions,
443
+ image_hidden_states=image_features if pixel_values is not None else None,
444
+ )
445
+
446
+ # AR inference
447
+ def prepare_inputs_for_generation(
448
+ self,
449
+ input_ids,
450
+ past_key_values=None,
451
+ inputs_embeds=None,
452
+ cache_position=None,
453
+ position_ids=None,
454
+ pixel_values=None,
455
+ intrinsic=None,
456
+ attention_mask=None,
457
+ token_type_ids=None,
458
+ use_cache=True,
459
+ num_logits_to_keep=None,
460
+ labels=None,
461
+ **kwargs,
462
+ ):
463
+ model_inputs = self.language_model.prepare_inputs_for_generation(
464
+ input_ids,
465
+ past_key_values=past_key_values,
466
+ inputs_embeds=inputs_embeds,
467
+ attention_mask=attention_mask,
468
+ position_ids=position_ids,
469
+ cache_position=cache_position,
470
+ use_cache=use_cache,
471
+ num_logits_to_keep=num_logits_to_keep,
472
+ token_type_ids=token_type_ids,
473
+ **kwargs,
474
+ )
475
+ if model_inputs.get("position_ids") is not None:
476
+ model_inputs["position_ids"] += 1
477
+ if cache_position[0] == 0:
478
+ model_inputs["pixel_values"] = pixel_values
479
+ is_training = token_type_ids is not None and labels is not None
480
+ if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
481
+ causal_mask = self._update_causal_mask(attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training)
482
+ model_inputs["attention_mask"] = causal_mask
483
+ model_inputs["intrinsic"] = intrinsic
484
+ return model_inputs
485
+
486
+ @torch.no_grad()
487
+ def predict_action(
488
+ self,
489
+ model_inputs,
490
+ ) -> torch.Tensor:
491
+ model_inputs = model_inputs.to(torch.bfloat16).to(self.device)
492
+ input_len = model_inputs["input_ids"].shape[-1]
493
+ generation_outputs = self.generate(**model_inputs, max_new_tokens=256, do_sample=False)
494
+ return generation_outputs[:,input_len:]
495
+
496
+ @classmethod
497
+ def from_pretrained(
498
+ cls,
499
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
500
+ *model_args,
501
+ config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
502
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
503
+ ignore_mismatched_sizes: bool = False,
504
+ force_download: bool = False,
505
+ local_files_only: bool = False,
506
+ token: Optional[Union[str, bool]] = None,
507
+ revision: str = "main",
508
+ use_safetensors: Optional[bool] = None,
509
+ weights_only: bool = True,
510
+ **kwargs,
511
+ ):
512
+ model = super().from_pretrained(
513
+ pretrained_model_name_or_path,
514
+ *model_args,
515
+ config=config,
516
+ cache_dir=cache_dir,
517
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
518
+ force_download=force_download,
519
+ local_files_only=local_files_only,
520
+ token=token,
521
+ revision=revision,
522
+ use_safetensors=use_safetensors,
523
+ weights_only=weights_only,
524
+ **kwargs,
525
+ )
526
+ if model.config.use_spatial_token:
527
+ model.language_model.model.embed_tokens.weight.data[-model.config.spatial_token_num:] = model.spatial_embed_tokens.weight.data
528
+ return model
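
A minimal sketch of the embedding copy performed by the `from_pretrained` override above (the `use_spatial_token` branch), using tiny dummy modules with toy sizes in place of the real Gemma embedding table and `spatial_embed_tokens`; names and sizes here are illustrative assumptions only.

    import torch
    import torch.nn as nn

    vocab_size, hidden_size, spatial_token_num = 10, 4, 3                # toy sizes, not the real config
    embed_tokens = nn.Embedding(vocab_size, hidden_size)                 # stands in for language_model.model.embed_tokens
    spatial_embed_tokens = nn.Embedding(spatial_token_num, hidden_size)  # stands in for model.spatial_embed_tokens

    # copy the separately stored spatial-token embeddings into the last rows of the LM vocabulary,
    # mirroring what the override does right after loading the checkpoint
    embed_tokens.weight.data[-spatial_token_num:] = spatial_embed_tokens.weight.data
    assert torch.equal(embed_tokens.weight[-spatial_token_num:], spatial_embed_tokens.weight)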
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_spatialvla.SpatialVLAProcessor"
4
+ },
5
+ "do_convert_rgb": null,
6
+ "do_normalize": false,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_processor_type": "SiglipImageProcessor",
15
+ "image_seq_length": 256,
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "processor_class": "SpatialVLAProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "height": 224,
26
+ "width": 224
27
+ }
28
+ }
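
A minimal sketch of the preprocessing this config describes, assuming it is loaded as a stock `SiglipImageProcessor` with the values copied from the JSON above (note `do_normalize` is false, so `image_mean`/`image_std` are carried in the config but not applied); the dummy image is an assumption for illustration.

    from transformers import SiglipImageProcessor
    from PIL import Image
    import numpy as np

    image_processor = SiglipImageProcessor(
        do_resize=True, size={"height": 224, "width": 224}, resample=3,   # 3 = PIL bicubic
        do_rescale=True, rescale_factor=1 / 255,
        do_normalize=False, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5],
    )
    dummy = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))      # toy 640x480 frame
    pixel_values = image_processor(dummy, return_tensors="pt")["pixel_values"]
    print(pixel_values.shape)                                             # torch.Size([1, 3, 224, 224])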
processing_spatialvla.py ADDED
@@ -0,0 +1,254 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import logging
16
+ from typing import List, Optional, Union, Dict
17
+ import numpy as np
18
+ import torch
19
+ from transformers.feature_extraction_utils import BatchFeature
20
+ from transformers.image_utils import ImageInput, is_valid_image
21
+ from transformers.processing_utils import Unpack, _validate_images_text_input_order, ProcessorMixin
22
+ from transformers.tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
23
+ from transformers.utils import logging
24
+ from transformers.models.paligemma.processing_paligemma import (
25
+ make_batched_images,
26
+ build_string_from_input,
27
+ _is_str_or_image,
28
+ PaliGemmaProcessorKwargs,
29
+ IMAGE_TOKEN,
30
+ EXTRA_TOKENS
31
+ )
32
+ from .action_tokenizer import SpatialActionTokenizer
33
+ logger = logging.get_logger(__name__)
34
+
35
+ class SpatialVLAProcessor(ProcessorMixin):
36
+ attributes = ["image_processor", "tokenizer"]
37
+ valid_kwargs = ["chat_template"]
38
+ image_processor_class = "SiglipImageProcessor"
39
+ tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")
40
+
41
+ def __init__(
42
+ self,
43
+ image_processor=None,
44
+ tokenizer=None,
45
+ chat_template=None,
46
+ statistics: Optional[dict] = None,
47
+ bin_policy=None,
48
+ intrinsic_config=None,
49
+ action_config=None,
50
+ num_obs_steps=1,
51
+ obs_delta=1,
52
+ action_chunk_size=1,
53
+ min_sigma=0.0,
54
+ **kwargs,
55
+ ):
56
+ if image_processor is None:
57
+ raise ValueError("You need to specify an `image_processor`.")
58
+ if tokenizer is None:
59
+ raise ValueError("You need to specify a `tokenizer`.")
60
+ if not hasattr(image_processor, "image_seq_length"):
61
+ raise ValueError("Image processor is missing an `image_seq_length` attribute.")
62
+
63
+ self.image_seq_length = image_processor.image_seq_length
64
+
65
+ if not hasattr(tokenizer, "image_token"):
66
+ image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True)
67
+ tokens_to_add = {"additional_special_tokens": [image_token]}
68
+ tokenizer.add_special_tokens(tokens_to_add)
69
+ self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
70
+ else:
71
+ self.image_token_id = tokenizer.image_token_id
72
+
73
+ tokenizer.add_tokens(EXTRA_TOKENS)
74
+ tokenizer.add_bos_token = False
75
+ tokenizer.add_eos_token = False
76
+
77
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
78
+
79
+ # action tokenizer
80
+ self.statistics = statistics if statistics else {}
81
+ self.bin_policy = bin_policy
82
+ self.min_sigma = min_sigma
83
+ self.intrinsic_config = intrinsic_config
84
+ self.action_config = action_config
85
+ self.num_obs_steps = num_obs_steps
86
+ self.obs_delta = obs_delta
87
+ self.action_chunk_size = action_chunk_size
88
+ self.dataset_intrinsics = {}
89
+ height, width = image_processor.size["height"], image_processor.size["width"]
90
+
91
+ # scale intrinsic matrix
92
+ for k, v in intrinsic_config.items():
93
+ K = torch.tensor(v["intrinsic"]).float()
94
+ K[:2] *= torch.tensor([width / v["width"], height / v["height"]])[:, None]
95
+ self.dataset_intrinsics[k] = K
96
+
97
+ self.action_tokenizer = SpatialActionTokenizer(
98
+ tokenizer=tokenizer, num_bins=action_config["num_bins"],
99
+ bin_policy=bin_policy, use_spherical=action_config["use_spherical"],
100
+ min_sigma=min_sigma,
101
+ )
102
+
103
+ def __call__(
104
+ self,
105
+ images: ImageInput = None,
106
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
107
+ unnorm_key: Optional[str] = None,
108
+ suffix_actions: Optional[np.ndarray] = None, # (t e)
109
+ **kwargs: Unpack[PaliGemmaProcessorKwargs],
110
+ ) -> BatchFeature:
111
+ images, text = _validate_images_text_input_order(images, text)
112
+
113
+ output_kwargs = self._merge_kwargs(
114
+ PaliGemmaProcessorKwargs,
115
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
116
+ **kwargs,
117
+ )
118
+ if suffix_actions is not None:
119
+ action_tokens = self.action_tokenizer(suffix_actions) # (n,3)
120
+ suffix="".join(action_tokens.flatten())
121
+ else:
122
+ suffix = output_kwargs["text_kwargs"].pop("suffix", None)
123
+
124
+ return_token_type_ids = True if suffix is not None else False
125
+
126
+ if images is None:
127
+ raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.")
128
+ if text is None:
129
+ logger.warning_once( "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model.")
130
+ text = ""
131
+
132
+ if _is_str_or_image(text):
133
+ text = [text]
134
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
135
+ pass
136
+
137
+ if text is not None and images is not None:
138
+ if not any(IMAGE_TOKEN in sample for sample in text):
139
+ if isinstance(text, List) and isinstance(images, List):
140
+ if len(images) != len(text):
141
+ raise ValueError(
142
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image or list of images."
143
+ )
144
+ if is_valid_image(images):
145
+ images = [[images]]
146
+ elif isinstance(images, list) and is_valid_image(images[0]):
147
+ images = [[image] for image in images]
148
+ elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])):
149
+ raise ValueError("images must be an image, list of images or list of list of images")
150
+ if suffix is not None and _is_str_or_image(suffix): suffix = [suffix]
151
+ if suffix is not None: suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
152
+ input_strings = [
153
+ build_string_from_input(
154
+ prompt=prompt,
155
+ bos_token=self.tokenizer.bos_token,
156
+ image_seq_len=self.image_seq_length,
157
+ image_token=IMAGE_TOKEN,
158
+ num_images=len(image_list) if isinstance(image_list, list) else 1,
159
+ )
160
+ for prompt, image_list in zip(text, images)
161
+ ]
162
+ images = make_batched_images(images)
163
+ else:
164
+ expanded_samples = []
165
+ for sample in text:
166
+ expanded_sample = sample.replace(IMAGE_TOKEN, IMAGE_TOKEN * self.image_seq_length)
167
+ bos_rfind_index = expanded_sample.rfind(IMAGE_TOKEN)
168
+ bos_index = bos_rfind_index + len(IMAGE_TOKEN) if bos_rfind_index != -1 else 0
169
+ expanded_sample = (
170
+ expanded_sample[:bos_index] + self.tokenizer.bos_token + expanded_sample[bos_index:]
171
+ )
172
+ expanded_samples.append(expanded_sample)
173
+ input_strings = [f"{sample}\n" for sample in expanded_samples]
174
+ pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
175
+
176
+ if output_kwargs["text_kwargs"].get("max_length", None) is not None:
177
+ output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length
178
+
179
+ inputs = self.tokenizer(
180
+ input_strings,
181
+ text_pair=suffix,
182
+ return_token_type_ids=return_token_type_ids,
183
+ **output_kwargs["text_kwargs"],
184
+ )
185
+
186
+ intrinsic = self.dataset_intrinsics[unnorm_key] if unnorm_key in self.dataset_intrinsics else self.dataset_intrinsics["default"]
187
+ return_data = {**inputs, "pixel_values": pixel_values, "intrinsic": intrinsic}
188
+
189
+ if return_token_type_ids:
190
+ labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
191
+ return_data.update({"labels": labels})
192
+ return BatchFeature(data=return_data)
193
+
194
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
195
+ def batch_decode(self, *args, **kwargs):
196
+ """
197
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
198
+ refer to the docstring of this method for more information.
199
+ """
200
+ return self.tokenizer.batch_decode(*args, **kwargs)
201
+
202
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma
203
+ def decode(self, *args, **kwargs):
204
+ """
205
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
206
+ the docstring of this method for more information.
207
+ """
208
+ return self.tokenizer.decode(*args, **kwargs)
209
+
210
+ @property
211
+ def model_input_names(self):
212
+ tokenizer_input_names = self.tokenizer.model_input_names
213
+ image_processor_input_names = self.image_processor.model_input_names
214
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
215
+
216
+ def decode_actions(
217
+ self,
218
+ generation_outputs: torch.Tensor,
219
+ unnorm_key: Optional[str] = None,
220
+ ) -> Dict[str, torch.Tensor]:
221
+ action_token_num = 3 # translation + rotation + gripper
222
+ predicted_action_token_ids = generation_outputs[0, : action_token_num * self.action_chunk_size].detach().cpu().long().numpy()
223
+ assert predicted_action_token_ids[-1] != self.tokenizer.eos_token_id, "[error] actions contain EOS token, please check your truncation settings!"
224
+
225
+ if predicted_action_token_ids.shape[0] < action_token_num * self.action_chunk_size: # pad with zeros
226
+ logger.warning("Predicted action sequence is shorter than expected, padding with zero tokens!")
227
+ predicted_action_token_ids = np.concatenate(
228
+ [
229
+ predicted_action_token_ids,
230
+ np.zeros(action_token_num * self.action_chunk_size - predicted_action_token_ids.shape[0], dtype=np.longlong),
231
+ ]
232
+ )
233
+ predicted_action_token_ids = predicted_action_token_ids.reshape(-1, action_token_num)
234
+ normalized_action_chunks = self.action_tokenizer.decode_token_ids_to_actions(predicted_action_token_ids)
235
+
236
+ if unnorm_key is None:
237
+ logger.warning(f"unnorm_key {unnorm_key} is not provided, falling back to the first dataset key in statistics")
238
+ unnorm_key = next(iter(self.statistics))
239
+ action_norm_stats = self.statistics[unnorm_key]["action"]
240
+
241
+ action_dim = len(action_norm_stats["q01"])
242
+ mask = np.array(action_norm_stats.get("mask", np.ones(action_dim)), dtype=bool)
243
+ action_high, action_low = np.array(action_norm_stats["q99"]), np.array(action_norm_stats["q01"])
244
+
245
+ actions = []
246
+ for normalized_actions in normalized_action_chunks:
247
+ action = np.where(
248
+ mask,
249
+ 0.5 * (normalized_actions + 1) * (action_high - action_low) + action_low,
250
+ normalized_actions,
251
+ )
252
+ actions.append(action)
253
+ actions = np.stack(actions)
254
+ return {"actions": actions, "action_ids": predicted_action_token_ids}
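
A small worked example of the unnormalization inside `decode_actions` above, on toy numbers (not taken from the statistics): masked dimensions are mapped from [-1, 1] back to their [q01, q99] range, while unmasked dimensions (e.g. the gripper) pass through unchanged.

    import numpy as np

    normalized = np.array([0.0, 1.0, 0.6])      # toy model outputs in [-1, 1]
    q01  = np.array([-0.03, -0.04, 0.0])        # toy per-dimension 1st percentiles
    q99  = np.array([ 0.03,  0.04, 1.0])        # toy per-dimension 99th percentiles
    mask = np.array([True, True, False])        # last dim passes through, like the gripper above

    # 0.5 * (a + 1) maps [-1, 1] to [0, 1]; scaling by (q99 - q01) and shifting by q01 restores units
    action = np.where(mask, 0.5 * (normalized + 1) * (q99 - q01) + q01, normalized)
    print(action)                               # [0.   0.04 0.6 ]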
processor_config.json ADDED
@@ -0,0 +1,327 @@
1
+ {
2
+ "action_chunk_size": 4,
3
+ "action_config": {
4
+ "distribution": "gaussian",
5
+ "num_bins": {
6
+ "gripper": 2,
7
+ "rotation": {
8
+ "pitch_bins": 16,
9
+ "roll_bins": 16,
10
+ "yaw_bins": 16
11
+ },
12
+ "total": 8194,
13
+ "translation": {
14
+ "phi_bins": 32,
15
+ "r_bins": 8,
16
+ "theta_bins": 16
17
+ }
18
+ },
19
+ "use_spherical": true
20
+ },
21
+ "auto_map": {
22
+ "AutoProcessor": "processing_spatialvla.SpatialVLAProcessor"
23
+ },
24
+ "bin_policy": {
25
+ "rotation": {
26
+ "pitch_bins": [
27
+ -1.0,
28
+ -0.4236293919771139,
29
+ -0.2973624970533583,
30
+ -0.21059576820767317,
31
+ -0.14044938844843713,
32
+ -0.0791789125851777,
33
+ -0.023048480293744636,
34
+ 0.030167161843358437,
35
+ 0.08204200739679071,
36
+ 0.13389374587953162,
37
+ 0.18703587338481154,
38
+ 0.24302765601977616,
39
+ 0.30406026229156,
40
+ 0.37378821800324374,
41
+ 0.45971873753598247,
42
+ 0.5836276162507279,
43
+ 0.9999999999999991
44
+ ],
45
+ "roll_bins": [
46
+ -0.9999999999999999,
47
+ -0.48696292418679255,
48
+ -0.3676073739484146,
49
+ -0.28549591499691584,
50
+ -0.21907612836502022,
51
+ -0.16103745543314568,
52
+ -0.10784881328909159,
53
+ -0.05740408497876547,
54
+ -0.00821079709993185,
55
+ 0.040983744804115825,
56
+ 0.0914324636886914,
57
+ 0.144628635967148,
58
+ 0.20268023967111456,
59
+ 0.269122809861373,
60
+ 0.35127995163586373,
61
+ 0.4707654855904555,
62
+ 0.9999999999999944
63
+ ],
64
+ "yaw_bins": [
65
+ -1.0,
66
+ -0.4473279373756505,
67
+ -0.3332741619243962,
68
+ -0.25494122059754437,
69
+ -0.19161826850058544,
70
+ -0.1363039890445066,
71
+ -0.08562203792073503,
72
+ -0.03756062019257189,
73
+ 0.009304860859811767,
74
+ 0.05616950282205181,
75
+ 0.1042282501882964,
76
+ 0.15490516155832307,
77
+ 0.21021078414249433,
78
+ 0.2735184749468475,
79
+ 0.35182078330381356,
80
+ 0.465787139096136,
81
+ 0.9999999999999982
82
+ ]
83
+ },
84
+ "translation": {
85
+ "phi_bins": [
86
+ -3.141592653589793,
87
+ -2.611427824867527,
88
+ -2.250204012654159,
89
+ -1.9664312602343461,
90
+ -1.727567317192397,
91
+ -1.5180333466123621,
92
+ -1.3290717520482633,
93
+ -1.1552219136523942,
94
+ -0.9928174267972283,
95
+ -0.8392525074770641,
96
+ -0.6925871222960145,
97
+ -0.5513178350935227,
98
+ -0.41423640072445,
99
+ -0.28033770999881874,
100
+ -0.14875675757685075,
101
+ -0.018723165750234833,
102
+ 0.11047361805186211,
103
+ 0.2395128839618976,
104
+ 0.3690681218889241,
105
+ 0.49983192073784344,
106
+ 0.6325427359682341,
107
+ 0.7680163128439619,
108
+ 0.9071854848022353,
109
+ 1.0511538919389105,
110
+ 1.2012725735857557,
111
+ 1.359254858953288,
112
+ 1.52735781547609,
113
+ 1.708685638209645,
114
+ 1.9077325684228925,
115
+ 2.1314415012063312,
116
+ 2.3915198815314898,
117
+ 2.710422326959981,
118
+ 3.141592653589793
119
+ ],
120
+ "r_bins": [
121
+ 0.0,
122
+ 0.24715317617636928,
123
+ 0.3738653185927623,
124
+ 0.4741546344271254,
125
+ 0.5660713758244397,
126
+ 0.6591763123588074,
127
+ 0.7640208367398835,
128
+ 0.905077308623254,
129
+ 1.7320508075688772
130
+ ],
131
+ "theta_bins": [
132
+ 0.0,
133
+ 0.9482227818534477,
134
+ 1.232949635587941,
135
+ 1.4288683204982662,
136
+ 1.586471048273713,
137
+ 1.7230822806307542,
138
+ 1.8470152323808435,
139
+ 1.9631023836372554,
140
+ 2.0745890527961355,
141
+ 2.1839605665055863,
142
+ 2.2933911513280534,
143
+ 2.405063409356251,
144
+ 2.521491080766048,
145
+ 2.6459805006534918,
146
+ 2.7834919014248793,
147
+ 2.942634872432456,
148
+ 3.141592653589793
149
+ ]
150
+ }
151
+ },
152
+ "intrinsic_config": {
153
+ "bridge_orig/1.0.0": {
154
+ "height": 480,
155
+ "intrinsic": [
156
+ [
157
+ 623.588,
158
+ 0,
159
+ 319.501
160
+ ],
161
+ [
162
+ 0,
163
+ 623.588,
164
+ 239.545
165
+ ],
166
+ [
167
+ 0,
168
+ 0,
169
+ 1
170
+ ]
171
+ ],
172
+ "width": 640
173
+ },
174
+ "default": {
175
+ "height": 480,
176
+ "intrinsic": [
177
+ [
178
+ 623.588,
179
+ 0,
180
+ 319.501
181
+ ],
182
+ [
183
+ 0,
184
+ 623.588,
185
+ 239.545
186
+ ],
187
+ [
188
+ 0,
189
+ 0,
190
+ 1
191
+ ]
192
+ ],
193
+ "width": 640
194
+ }
195
+ },
196
+ "min_sigma": 0.0,
197
+ "num_obs_steps": 1,
198
+ "obs_delta": 1,
199
+ "processor_class": "SpatialVLAProcessor",
200
+ "statistics": {
201
+ "bridge_orig/1.0.0": {
202
+ "action": {
203
+ "mask": [
204
+ true,
205
+ true,
206
+ true,
207
+ true,
208
+ true,
209
+ true,
210
+ false
211
+ ],
212
+ "max": [
213
+ 0.41691166162490845,
214
+ 0.25864794850349426,
215
+ 0.21218234300613403,
216
+ 3.122201919555664,
217
+ 1.8618112802505493,
218
+ 6.280478477478027,
219
+ 1.0
220
+ ],
221
+ "mean": [
222
+ 0.00023341862834058702,
223
+ 0.00013004240463487804,
224
+ -0.0001276263064937666,
225
+ -0.0001556586939841509,
226
+ -0.0004039350023958832,
227
+ 0.0002355838514631614,
228
+ 0.5764582753181458
229
+ ],
230
+ "min": [
231
+ -0.4007510244846344,
232
+ -0.13874775171279907,
233
+ -0.22553899884223938,
234
+ -3.2010786533355713,
235
+ -1.8618112802505493,
236
+ -6.279075622558594,
237
+ 0.0
238
+ ],
239
+ "q01": [
240
+ -0.02872725307941437,
241
+ -0.04170349963009357,
242
+ -0.026093858778476715,
243
+ -0.08092105075716972,
244
+ -0.09288699507713317,
245
+ -0.20718276381492615,
246
+ 0.0
247
+ ],
248
+ "q99": [
249
+ 0.028309678435325586,
250
+ 0.040855254605412394,
251
+ 0.040161586627364146,
252
+ 0.08192047759890528,
253
+ 0.07792850524187081,
254
+ 0.20382574498653397,
255
+ 1.0
256
+ ],
257
+ "std": [
258
+ 0.009765730239450932,
259
+ 0.013689513318240643,
260
+ 0.012667140923440456,
261
+ 0.02853446453809738,
262
+ 0.030637893825769424,
263
+ 0.07691765576601028,
264
+ 0.4973663091659546
265
+ ]
266
+ },
267
+ "num_trajectories": 60064,
268
+ "num_transitions": 2135463,
269
+ "proprio": {
270
+ "max": [
271
+ 0.0,
272
+ 0.0,
273
+ 0.0,
274
+ 0.0,
275
+ 0.0,
276
+ 0.0,
277
+ 0.0
278
+ ],
279
+ "mean": [
280
+ 0.0,
281
+ 0.0,
282
+ 0.0,
283
+ 0.0,
284
+ 0.0,
285
+ 0.0,
286
+ 0.0
287
+ ],
288
+ "min": [
289
+ 0.0,
290
+ 0.0,
291
+ 0.0,
292
+ 0.0,
293
+ 0.0,
294
+ 0.0,
295
+ 0.0
296
+ ],
297
+ "q01": [
298
+ 0.0,
299
+ 0.0,
300
+ 0.0,
301
+ 0.0,
302
+ 0.0,
303
+ 0.0,
304
+ 0.0
305
+ ],
306
+ "q99": [
307
+ 0.0,
308
+ 0.0,
309
+ 0.0,
310
+ 0.0,
311
+ 0.0,
312
+ 0.0,
313
+ 0.0
314
+ ],
315
+ "std": [
316
+ 0.0,
317
+ 0.0,
318
+ 0.0,
319
+ 0.0,
320
+ 0.0,
321
+ 0.0,
322
+ 0.0
323
+ ]
324
+ }
325
+ }
326
+ }
327
+ }
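
The `intrinsic_config` entries above are given at the native 640x480 camera resolution; `SpatialVLAProcessor.__init__` rescales them to the 224x224 processor input. A standalone sketch of that rescaling, with the matrix values copied from the config:

    import torch

    K = torch.tensor([[623.588, 0.0, 319.501],
                      [0.0, 623.588, 239.545],
                      [0.0, 0.0, 1.0]])
    src_w, src_h = 640, 480
    dst_w, dst_h = 224, 224

    K_scaled = K.clone()
    # first row (fx, 0, cx) is scaled by the width ratio, second row (0, fy, cy) by the height ratio
    K_scaled[:2] *= torch.tensor([dst_w / src_w, dst_h / src_h])[:, None]
    print(K_scaled)   # fx, cx scaled by 0.35; fy, cy scaled by ~0.467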
simplerenv.md ADDED
@@ -0,0 +1,26 @@
1
+ | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
2
+ |:---------------------------------------------------|:--------------------|:----------------|:----------|:-------|:-------|:----------|:-----------|:------------|:--------|:--------|
3
+ | coke_can/matching_avg | nan | 0.857 | 0.71 | 0.567 | 0.787 | 0.17 | nan | 0.027 | 0.163 | 0.727 |
4
+ | coke_can/variant_avg | nan | 0.898 | 0.813 | 0.49 | 0.823 | 0.006 | nan | 0.022 | 0.545 | nan |
5
+ | coke_can/matching/horizontal | nan | 0.96 | 0.86 | 0.82 | 0.74 | 0.21 | nan | 0.05 | 0.27 | 0.85 |
6
+ | coke_can/matching/vertical | nan | 0.9 | 0.79 | 0.33 | 0.74 | 0.21 | nan | 0.0 | 0.03 | 0.43 |
7
+ | coke_can/matching/standing | nan | 0.71 | 0.48 | 0.55 | 0.88 | 0.09 | nan | 0.03 | 0.19 | 0.9 |
8
+ | coke_can/variant/horizontal | nan | 0.969 | 0.92 | 0.569 | 0.822 | 0.005 | nan | 0.022 | 0.711 | nan |
9
+ | coke_can/variant/vertical | nan | 0.76 | 0.704 | 0.204 | 0.754 | 0.0 | nan | 0.013 | 0.271 | nan |
10
+ | coke_can/variant/standing | nan | 0.964 | 0.813 | 0.698 | 0.893 | 0.013 | nan | 0.031 | 0.653 | nan |
11
+ | move_near/variant | nan | 0.5 | 0.446 | 0.323 | 0.792 | 0.031 | nan | 0.04 | 0.477 | nan |
12
+ | move_near/matching | nan | 0.442 | 0.354 | 0.317 | 0.779 | 0.042 | nan | 0.05 | 0.462 | 0.663 |
13
+ | drawer/matching_avg | nan | 0.73 | 0.565 | 0.597 | 0.25 | 0.227 | nan | 0.139 | 0.356 | 0.268 |
14
+ | drawer/variant_avg | nan | 0.323 | 0.267 | 0.294 | 0.353 | 0.011 | nan | 0.069 | 0.177 | nan |
15
+ | drawer/matching/open | nan | 0.601 | 0.463 | 0.296 | 0.157 | 0.009 | nan | 0.0 | 0.194 | 0.287 |
16
+ | drawer/matching/close | nan | 0.861 | 0.667 | 0.891 | 0.343 | 0.444 | nan | 0.278 | 0.518 | 0.25 |
17
+ | drawer/variant/open | nan | 0.27 | 0.212 | 0.069 | 0.333 | 0.0 | nan | 0.005 | 0.158 | nan |
18
+ | drawer/variant/close | nan | 0.376 | 0.323 | 0.519 | 0.372 | 0.021 | nan | 0.132 | 0.195 | nan |
19
+ | put_spoon_on_tablecloth/matching_partial | 0.20833333333333334 | nan | nan | 0.167 | nan | 0.347 | 0.778 | nan | 0.041 | 0.375 |
20
+ | put_spoon_on_tablecloth/matching_entire | 0.16666666666666666 | nan | nan | 0.0 | nan | 0.125 | 0.472 | nan | 0.0 | 0.208 |
21
+ | put_carrot_on_plate/matching_partial | 0.2916666666666667 | nan | nan | 0.208 | nan | 0.528 | 0.278 | nan | 0.333 | 0.333 |
22
+ | put_carrot_on_plate/matching_entire | 0.25 | nan | nan | 0.042 | nan | 0.083 | 0.097 | nan | 0.0 | 0.25 |
23
+ | stack_green_block_on_yellow_block/matching_partial | 0.625 | nan | nan | 0.083 | nan | 0.319 | 0.403 | nan | 0.125 | 0.083 |
24
+ | stack_green_block_on_yellow_block/matching_entire | 0.2916666666666667 | nan | nan | 0.0 | nan | 0.0 | 0.042 | nan | 0.0 | 0.083 |
25
+ | put_eggplant_in_basket/matching_partial | 1.0 | nan | nan | 0.0 | nan | 0.667 | 0.875 | nan | 0.083 | 0.0 |
26
+ | put_eggplant_in_basket/matching_entire | 1.0 | nan | nan | 0.0 | nan | 0.431 | 0.569 | nan | 0.041 | 0.0 |
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<image>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "bos_token": {
12
+ "content": "<bos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "eos_token": {
19
+ "content": "<eos>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<pad>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "unk_token": {
33
+ "content": "<unk>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
test_huggingface.py ADDED
@@ -0,0 +1,30 @@
1
+ import os
2
+ import argparse
3
+ from pathlib import Path
4
+ import torch
5
+ from PIL import Image
6
+ from transformers import AutoModel, AutoProcessor
7
+
8
+ parser = argparse.ArgumentParser("Huggingface AutoModel Testing")
9
+ parser.add_argument("--model_name_or_path", default=".", help="pretrained model name or path.")
10
+ parser.add_argument("--num_images", type=int, default=1, help="num_images for testing.")
11
+
12
+ args = parser.parse_args()
13
+ if __name__ == "__main__":
14
+ model_name_or_path = Path(args.model_name_or_path)
15
+ processor = AutoProcessor.from_pretrained(args.model_name_or_path, trust_remote_code=True)
16
+ print(processor.statistics)
17
+
18
+ model = AutoModel.from_pretrained(args.model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16).eval().cuda()
19
+
20
+ image = Image.open("example.png").convert("RGB")
21
+ images = [image] * args.num_images
22
+ prompt = "What action should the robot take to pick the cup?"
23
+ inputs = processor(images=images, text=prompt, unnorm_key="bridge_orig/1.0.0", return_tensors="pt")
24
+ print(inputs)
25
+
26
+ generation_outputs = model.predict_action(inputs)
27
+ print(generation_outputs, processor.batch_decode(generation_outputs))
28
+
29
+ actions = processor.decode_actions(generation_outputs, unnorm_key="bridge_orig/1.0.0")
30
+ print(actions)
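
With the argparse defaults above, this smoke test can presumably be run from a checkout of this repository as `python test_huggingface.py --model_name_or_path . --num_images 1`, provided an `example.png` is available in the working directory and a CUDA device is present (the script moves the model with `.cuda()`).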
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2523a63c898ebf0a32c7282a2e459ef2c950a846c5f3172305089e4149b6b6c3
3
+ size 36157680
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff