import os
import math
import torch
from torch import nn
from functools import partial
import torch.nn.functional as F


class Adapter_Template(nn.Module):
    """Base adapter: unpacks the multimodal input tuple and projects vision features."""

    def __init__(self, config):
        super().__init__()
        self.gradient_checkpointing = False

    def freeze_module(self, module):
        # Disable gradients for every parameter of the given submodule.
        for p in module.parameters():
            p.requires_grad = False

    def forward(self, inputs, add_start_end=True):
        input_ids, hidden_states, targets, attn_mask, loss_mask = inputs
        image_features = self.forward_adapter_modules(hidden_states)
        return (input_ids, image_features, targets, attn_mask, loss_mask)


class AdapterSigLIP(Adapter_Template):
    """Adapter for SigLIP vision features: pixel-shuffle downsampling followed by an MLP projection."""

    def __init__(self, config):
        super().__init__(config)
        # Pixel shuffle with scale 0.5 packs a 2x2 patch neighbourhood into the channel
        # dimension, so the MLP operates on hidden_size * 4 channels.
        self.p0 = nn.Sequential(
            nn.LayerNorm(config.vision_config.hidden_size * 4),
            nn.Linear(config.vision_config.hidden_size * 4, config.intermediate_size),
            nn.GELU(),
            nn.Linear(config.intermediate_size, config.intermediate_size),
            nn.GELU(),
        )
        self.proj = nn.Linear(config.intermediate_size, config.vision_config.proj_output_dim)

    def freeze(self):
        self.freeze_module(self.p0)
        self.freeze_module(self.proj)

    def pixel_shuffle(self, x, scale_factor=0.5):
        # x: (N, W, H, C). With scale_factor=0.5 this quarters the number of spatial
        # positions and quadruples the channel dimension. Only applied when both
        # spatial dimensions are even.
        n, w, h, c = x.size()
        if w % 2 == 0 and h % 2 == 0:
            # N, W, H, C --> N, W, H * scale, C // scale
            x = x.reshape(n, w, int(h * scale_factor), int(c / scale_factor))
            # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
            x = x.permute(0, 2, 1, 3).contiguous()
            # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
            x = x.view(n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor * scale_factor)))
        return x

    def forward_adapter_modules(self, hidden_states):
        # hidden_states: (N, num_patches, hidden_size) with num_patches a perfect square.
        h = w = int(hidden_states.shape[1] ** 0.5)
        hidden_states = hidden_states.reshape(hidden_states.shape[0], h, w, -1)
        # Downsample the token grid (2x2 -> 1) while stacking features along channels.
        hidden_states = self.pixel_shuffle(hidden_states, scale_factor=0.5)
        hidden_states = hidden_states.reshape(hidden_states.shape[0], -1, hidden_states.shape[-1])
        # MLP (p0) followed by the final projection into the language-model embedding space.
        hidden_states = self.proj(self.p0(hidden_states))
        return hidden_states
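

# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the training pipeline).
# The config below is a stand-in built with SimpleNamespace; the real config
# object comes from the surrounding project, so the field values used here
# (hidden_size=1152, intermediate_size=4096, proj_output_dim=2048) are
# assumptions chosen to show the shape flow, not the actual model settings.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    config = SimpleNamespace(
        vision_config=SimpleNamespace(hidden_size=1152, proj_output_dim=2048),
        intermediate_size=4096,
    )
    adapter = AdapterSigLIP(config)

    # A 28 x 28 patch grid (784 tokens); pixel_shuffle requires an even grid side,
    # so the 784 tokens collapse to 196 tokens with 4x the channel width.
    vision_tokens = torch.randn(2, 784, 1152)
    image_features = adapter.forward_adapter_modules(vision_tokens)
    print(image_features.shape)  # torch.Size([2, 196, 2048])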