Upload model
modeling_t5mimo.py  (+18 -6)
@@ -314,6 +314,7 @@ class T5Attention(nn.Module):
         # Input is (batch_size, seq_length, dim)
         # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
         # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
+
         if self.config.is_mimo:
             batch_size, multivar_dim, seq_length = hidden_states.shape[:3]
         else:
@@ -402,7 +403,6 @@ class T5Attention(nn.Module):
 
 
 
-
         if position_bias is None:
             if not self.has_relative_attention_bias:
                 if self.config.is_mimo:
@@ -414,6 +414,7 @@ class T5Attention(nn.Module):
             else:
                 position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
 
+
             # if key and values are already calculated
             # we want only the last query position bias
             if past_key_value is not None:
@@ -427,6 +428,7 @@ class T5Attention(nn.Module):
 
 
 
+
         if self.pruned_heads:
             mask = torch.ones(position_bias.shape[1])
             mask[list(self.pruned_heads)] = 0
@@ -434,6 +436,7 @@ class T5Attention(nn.Module):
         else:
             position_bias_masked = position_bias
 
+
         scores += position_bias_masked
         attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)  # (batch_size, n_heads, seq_length, key_length)
         attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)  # (batch_size, n_heads, seq_length, key_length)
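A note on the shape bookkeeping in the T5Attention hunks above: the is_mimo branch reads the batch, variate, and sequence dimensions off a 4-D hidden state, while the standard path keeps T5's usual 3-D layout. A minimal sketch, assuming a hypothetical (batch_size, multivar_dim, seq_length, d_model) MIMO hidden state (the sizes below are made up):

    import torch

    B, M, L, D = 2, 3, 5, 16  # hypothetical batch, variates, sequence length, model dim

    # MIMO path: hidden_states carries an extra per-variate axis, so the first three
    # dims unpack as batch, variates, sequence length (as in the diff hunk).
    hidden_states_mimo = torch.randn(B, M, L, D)
    batch_size, multivar_dim, seq_length = hidden_states_mimo.shape[:3]
    assert (batch_size, multivar_dim, seq_length) == (B, M, L)

    # Standard path: 3-D (batch_size, seq_length, d_model), per the comment in the hunk.
    hidden_states_std = torch.randn(B, L, D)
    batch_size, seq_length = hidden_states_std.shape[:2]
    assert (batch_size, seq_length) == (B, L)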
@@ -909,20 +912,24 @@ class T5Stack(T5PreTrainedModel):
         # initialize past_key_values with `None` if past does not exist
         if past_key_values is None:
             past_key_values = [None] * len(self.block)
-
         if attention_mask is None:
-            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+            if self.config.is_mimo:
+                attention_mask = torch.ones(batch_size,multivar_seqs, mask_seq_length, device=inputs_embeds.device)
+            else:
+                attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
 
 
 
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
+
         if self.config.is_mimo:
             extended_attention_mask = self.get_extended_attention_mask(attention_mask, (input_shape[0], input_shape[2]))
-            extended_attention_mask = extended_attention_mask.
+            extended_attention_mask = extended_attention_mask.transpose(1,2).unsqueeze(2)
         else:
             extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
 
+
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.is_decoder and encoder_hidden_states is not None:
@@ -934,11 +941,16 @@ class T5Stack(T5PreTrainedModel):
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long)
+
             if self.config.is_mimo:
-
-                encoder_extended_attention_mask =
+
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask).transpose(1,2)
+
+                encoder_extended_attention_mask = encoder_extended_attention_mask.unsqueeze(2)
+
             else:
                 encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+
 
         else:
             encoder_extended_attention_mask = None
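The substantive change in the T5Stack hunk is the MIMO handling of the self-attention mask: a per-variate padding mask is built, and the extended mask is reshaped with transpose(1,2).unsqueeze(2). A shape sketch of what that reshaping does, assuming (not taken from the repo) that the mask is 3-D (batch, variates, seq), that get_extended_attention_mask yields the usual additive (batch, 1, variates, seq) bias for a 3-D mask (emulated below with plain tensor ops), and that MIMO attention scores carry a per-variate axis (batch, variates, heads, seq, key_length):

    import torch

    B, M, L, H = 2, 3, 5, 4  # hypothetical batch, variates, sequence length, heads

    # Per-variate padding mask, analogous to torch.ones(batch_size, multivar_seqs, mask_seq_length).
    attention_mask = torch.ones(B, M, L)

    # Emulate the extended mask: broadcastable additive bias, 0 for keep, large negative for masked.
    extended = attention_mask[:, None, :, :]                      # (B, 1, M, L)
    extended = (1.0 - extended) * torch.finfo(torch.float32).min

    # The two ops added in the diff move the variate axis next to batch and insert
    # singleton head/query axes: (B, 1, M, L) -> (B, M, 1, L) -> (B, M, 1, 1, L).
    extended = extended.transpose(1, 2).unsqueeze(2)
    print(extended.shape)             # torch.Size([2, 3, 1, 1, 5])

    # Broadcasts against assumed per-variate attention scores (B, M, H, L, L).
    scores = torch.randn(B, M, H, L, L)
    print((scores + extended).shape)  # torch.Size([2, 3, 4, 5, 5])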
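The cross-attention branch follows the same pattern with invert_attention_mask. A companion sketch, assuming a 2-D encoder padding mask (as encoder_hidden_shape suggests) and per-variate cross-attention scores (batch, variates, heads, tgt_len, src_len); invert_attention_mask is again emulated rather than called, and its exact output for this model is an assumption:

    import torch

    B, M, H, T, S = 2, 3, 4, 5, 7  # hypothetical batch, variates, heads, target len, source len

    # 2-D encoder padding mask, as created from encoder_hidden_shape in the hunk.
    encoder_attention_mask = torch.ones(B, S)

    # Emulated invert_attention_mask: additive bias, 0 for keep, large negative for masked.
    inverted = encoder_attention_mask[:, None, None, :]                 # (B, 1, 1, S)
    inverted = (1.0 - inverted) * torch.finfo(torch.float32).min

    # transpose(1, 2) swaps two singleton axes here; unsqueeze(2) adds the extra axis
    # needed to broadcast over the per-variate score tensor: (B, 1, 1, S) -> (B, 1, 1, 1, S).
    cross_mask = inverted.transpose(1, 2).unsqueeze(2)
    print(cross_mask.shape)             # torch.Size([2, 1, 1, 1, 7])

    scores = torch.randn(B, M, H, T, S)
    print((scores + cross_mask).shape)  # torch.Size([2, 3, 4, 5, 7])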