ccdv committed
Commit 0b14682
1 Parent(s): 0bbee42

add mask_first_token

Files changed (3)
  1. README.md +5 -1
  2. config.json +1 -0
  3. modeling_lsg_roberta.py +5 -0
README.md CHANGED
@@ -51,13 +51,17 @@ You can change various parameters like :
 Default parameters work well in practice. If you are short on memory, reduce block sizes, increase sparsity factor and remove dropout in the attention score matrix.
 
 ```python:
+from transformers import AutoModel
+
 model = AutoModel.from_pretrained("ccdv/lsg-distilroberta-base-4096",
     trust_remote_code=True,
     num_global_tokens=16,
     block_size=64,
     sparse_block_size=64,
-    sparsity_factor=4,
     attention_probs_dropout_prob=0.0
+    sparsity_factor=4,
+    sparsity_type="none",
+    mask_first_token=True
 )
 ```
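For convenience, here is a directly runnable sketch of the snippet the updated README describes. It is not part of the commit: the comma missing after `attention_probs_dropout_prob=0.0` in the committed text is restored so the call parses, and the parameter values are taken verbatim from the README diff above.

```python
# Sketch: load the LSG model with the long-sequence overrides shown in the README.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "ccdv/lsg-distilroberta-base-4096",
    trust_remote_code=True,
    num_global_tokens=16,
    block_size=64,
    sparse_block_size=64,
    attention_probs_dropout_prob=0.0,
    sparsity_factor=4,
    sparsity_type="none",
    mask_first_token=True,
)
```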
 
config.json CHANGED
@@ -27,6 +27,7 @@
   "intermediate_size": 3072,
   "layer_norm_eps": 1e-05,
   "lsh_num_pre_rounds": 1,
+  "mask_first_token": false,
   "max_position_embeddings": 4098,
   "model_type": "roberta",
   "num_attention_heads": 12,
modeling_lsg_roberta.py CHANGED
@@ -30,6 +30,7 @@ class LSGRobertaConfig(RobertaConfig):
         base_model_prefix="lsg",
         block_size=128,
         lsh_num_pre_rounds=1,
+        mask_first_token=False,
         num_global_tokens=1,
         pool_with_global=True,
         sparse_block_size=128,
@@ -45,6 +46,7 @@ class LSGRobertaConfig(RobertaConfig):
         self.base_model_prefix = base_model_prefix
         self.block_size = block_size
         self.lsh_num_pre_rounds = lsh_num_pre_rounds
+        self.mask_first_token = mask_first_token
         self.num_global_tokens = num_global_tokens
         self.pool_with_global = pool_with_global
         self.sparse_block_size = sparse_block_size
@@ -950,6 +952,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
         assert hasattr(config, "block_size") and hasattr(config, "adaptive")
         self.block_size = config.block_size
         self.adaptive = config.adaptive
+        self.mask_first_token = config.mask_first_token
         self.pool_with_global = config.pool_with_global
 
         self.embeddings = LSGRobertaEmbeddings(config)
@@ -986,6 +989,8 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
 
         if attention_mask is None:
             attention_mask = torch.ones(n, t, device=inputs_.device)
+        if self.mask_first_token:
+            attention_mask[:,0] = 0
 
         b = self.block_size * 2
         pad = t % self.block_size
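Functionally, the model change zeroes out the first position of the attention mask whenever `mask_first_token` is set, so the leading token (typically `<s>`) is hidden from attention. A standalone sketch of that effect, using placeholder batch and sequence sizes rather than the model's own tensors:

```python
import torch

n, t = 2, 8                        # placeholder batch size and sequence length
attention_mask = torch.ones(n, t)  # default mask built when none is passed in

mask_first_token = True            # mirrors config.mask_first_token
if mask_first_token:
    attention_mask[:, 0] = 0       # hide the first token from attention

print(attention_mask[0])           # tensor([0., 1., 1., 1., 1., 1., 1., 1.])
```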