ccdv committed
Commit b87718d
1 Parent(s): 2b632d7

replace 1e4 mask

Files changed (2):
  1. README.md (+1, -0)
  2. modeling_lsg_camembert.py (+11, -7)
README.md CHANGED
@@ -46,6 +46,7 @@ You can change various parameters like :
 * local block size (block_size=128)
 * sparse block size (sparse_block_size=128)
 * sparsity factor (sparsity_factor=2)
+* mask_first_token (mask first token since it is redundant with the first global token)
 * see config.json file
 
 Default parameters work well in practice. If you are short on memory, reduce block sizes, increase sparsity factor and remove dropout in the attention score matrix.
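For context, the parameters listed in this hunk can typically be overridden at load time. A minimal sketch, assuming the checkpoint is ccdv/lsg-camembert-base-4096 and that, as is usual for trust_remote_code checkpoints, extra keyword arguments are forwarded to the model config:

from transformers import AutoModel

# Checkpoint name is illustrative; any LSG-CamemBERT checkpoint exposing these
# config fields should behave the same way.
model = AutoModel.from_pretrained(
    "ccdv/lsg-camembert-base-4096",
    trust_remote_code=True,   # the LSG modeling code ships with the repo
    block_size=128,           # local block size
    sparse_block_size=128,    # sparse block size
    sparsity_factor=2,        # sparsity factor
    mask_first_token=True,    # mask the first token (redundant with the first global token)
)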
modeling_lsg_camembert.py CHANGED
@@ -182,7 +182,11 @@ class CausalAttentionProduct(nn.Module):
 
             # Add causal mask
             causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
-            causal_mask = torch.tril(torch.ones(*causal_shape, device=attention_mask.device), diagonal=-1).T * (-10000)
+            causal_mask = torch.tril(
+                torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
+                diagonal=-1
+                )
+            causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
             attention_scores[..., -causal_shape[0]:, -causal_shape[1]:] = causal_mask
 
             del attention_mask
@@ -300,7 +304,7 @@ class LSGAttentionProduct(nn.Module):
 
         # Pad before block reshaping
         if is_attn_mask:
-            pad_value = -10000
+            pad_value = torch.finfo(hidden_states.dtype).min
             hidden_states = hidden_states.transpose(-1, -2)
         else:
             pad_value = 0
@@ -333,7 +337,7 @@ class LSGAttentionProduct(nn.Module):
 
         # Pad before block reshaping
         if is_attn_mask:
-            pad_value = -10000
+            pad_value = torch.finfo(hidden_states.dtype).min
             hidden_states = hidden_states.transpose(-1, -2)
         else:
             pad_value = 0
@@ -557,7 +561,7 @@ class LSGSelfAttention(BaseSelfAttention):
         keys = keys.sum(dim=-2) / (mask + 1e-6)
         values = values.sum(dim=-2) / (mask + 1e-6)
 
-        mask = - (1. - mask.clamp(0, 1)) * 1e4
+        mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
 
     def get_sparse_tokens_with_stride(self, keys, values, mask):
@@ -622,7 +626,7 @@ class LSGSelfAttention(BaseSelfAttention):
         keys /= mask + 1e-8
         values /= mask + 1e-8
 
-        mask = -10000 * (1. - mask.clamp(0, 1))
+        mask = (1. - mask.clamp(0, 1)) * torch.finfo(mask.dtype).min
 
         return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
 
@@ -988,7 +992,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
         n, t = inputs_.size()[:2]
 
         if attention_mask is None:
-            attention_mask = torch.ones(n, t, device=inputs_.device)
+            attention_mask = torch.ones(n, t, device=inputs_.device, dtype=inputs_.dtype)
         if self.mask_first_token:
             attention_mask[:,0] = 0
 
@@ -1069,7 +1073,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
         )
 
         extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(extended_attention_mask.dtype).min
 
         return extended_attention_mask
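Across all of these hunks the change is the same: blocked positions in the additive attention mask are filled with torch.finfo(dtype).min instead of a hard-coded -10000, so the fill value always matches the score dtype (for example -65504 in float16). A minimal sketch of that pattern, with toy shapes that are not the model's real ones:

import torch

def additive_mask(attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # Binary mask (1 = keep, 0 = pad) -> additive mask summed with attention scores.
    # torch.finfo(dtype).min replaces the old hard-coded -10000 so the fill value
    # tracks the score dtype (-65504 for float16, about -3.4e38 for float32).
    mask = attention_mask.to(dtype)
    return (1.0 - mask) * torch.finfo(dtype).min

# Toy shapes for illustration: (batch, heads, query_len, key_len)
scores = torch.randn(1, 2, 4, 4)
attn_mask = torch.tensor([[1, 1, 1, 0]])  # last key position is padding
scores = scores + additive_mask(attn_mask, scores.dtype)[:, None, None, :]
probs = torch.softmax(scores, dim=-1)     # the padded column receives ~0 weight

Using the dtype's own minimum avoids relying on -10000 being sufficiently negative relative to the unmasked scores, and keeps the behaviour consistent between float32 and float16 runs.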