add mask_first_token
- README.md +5 -1
- config.json +1 -0
- modeling_lsg_roberta.py +5 -0
README.md
CHANGED
@@ -51,13 +51,17 @@ You can change various parameters like :
 Default parameters work well in practice. If you are short on memory, reduce block sizes, increase sparsity factor and remove dropout in the attention score matrix.
 
 ```python
+from transformers import AutoModel
+
 model = AutoModel.from_pretrained("ccdv/lsg-distilroberta-base-4096",
     trust_remote_code=True,
     num_global_tokens=16,
     block_size=64,
     sparse_block_size=64,
-    sparsity_factor=4,
     attention_probs_dropout_prob=0.0,
+    sparsity_factor=4,
+    sparsity_type="none",
+    mask_first_token=True
 )
 ```
 
config.json
CHANGED
@@ -27,6 +27,7 @@
 "intermediate_size": 3072,
 "layer_norm_eps": 1e-05,
 "lsh_num_pre_rounds": 1,
+"mask_first_token": false,
 "max_position_embeddings": 4098,
 "model_type": "roberta",
 "num_attention_heads": 12,
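For reference, the new key can be read back from the shipped configuration. This is only a quick sketch, assuming the model id from the README above and that `AutoConfig` resolves the custom configuration class through `trust_remote_code`:

```python
from transformers import AutoConfig

# Sketch: the default shipped in config.json above is expected to be False.
config = AutoConfig.from_pretrained("ccdv/lsg-distilroberta-base-4096", trust_remote_code=True)
print(config.mask_first_token)  # False
```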
modeling_lsg_roberta.py
CHANGED
@@ -30,6 +30,7 @@ class LSGRobertaConfig(RobertaConfig):
 base_model_prefix="lsg",
 block_size=128,
 lsh_num_pre_rounds=1,
+mask_first_token=False,
 num_global_tokens=1,
 pool_with_global=True,
 sparse_block_size=128,
@@ -45,6 +46,7 @@ class LSGRobertaConfig(RobertaConfig):
 self.base_model_prefix = base_model_prefix
 self.block_size = block_size
 self.lsh_num_pre_rounds = lsh_num_pre_rounds
+self.mask_first_token = mask_first_token
 self.num_global_tokens = num_global_tokens
 self.pool_with_global = pool_with_global
 self.sparse_block_size = sparse_block_size
@@ -950,6 +952,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
 assert hasattr(config, "block_size") and hasattr(config, "adaptive")
 self.block_size = config.block_size
 self.adaptive = config.adaptive
+self.mask_first_token = config.mask_first_token
 self.pool_with_global = config.pool_with_global
 
 self.embeddings = LSGRobertaEmbeddings(config)
@@ -986,6 +989,8 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
 if attention_mask is None:
     attention_mask = torch.ones(n, t, device=inputs_.device)
+if self.mask_first_token:
+    attention_mask[:,0] = 0
 
 b = self.block_size * 2
 pad = t % self.block_size
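The behaviour added above is easiest to see in isolation. Below is a minimal, illustrative sketch that mirrors the new attention-mask handling, with batch size and sequence length picked arbitrarily:

```python
import torch

n, t = 2, 8                  # illustrative batch size and sequence length
mask_first_token = True      # mirrors config.mask_first_token

# Same logic as the added lines: build the default all-ones mask, then
# zero out position 0 so the first token (typically <s>) is masked out
# of the attention computation.
attention_mask = torch.ones(n, t)
if mask_first_token:
    attention_mask[:, 0] = 0

print(attention_mask[0])     # tensor([0., 1., 1., 1., 1., 1., 1., 1.])
```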