Upload 3 files
- config.json +4 -4
- configuration_ltgbert.py +26 -2
- modeling_ltgbert.py +44 -15
config.json
CHANGED

@@ -1,12 +1,12 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "ltg/ltg-bert-babylm",
   "architectures": [
     "LtgBertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
-    "AutoConfig": "
-    "AutoModelForMaskedLM": "
+    "AutoConfig": "configuration_ltgbert.LtgBertConfig",
+    "AutoModelForMaskedLM": "modeling_ltgbert.LtgBertForMaskedLM",
     "AutoModelForSequenceClassification": "modeling_ltgbert.LtgBertForSequenceClassification"
   },
   "classifier_dropout": 0.2,
@@ -22,6 +22,6 @@
   "pad_token_id": 4,
   "position_bucket_size": 32,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.40.2",
   "vocab_size": 16384
 }
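Note on the config.json change: registering AutoConfig and AutoModelForMaskedLM in auto_map lets the custom classes shipped in this repository be resolved through the standard Auto classes, provided remote code is explicitly trusted. A minimal loading sketch, assuming the repository id matches the _name_or_path value above (ltg/ltg-bert-babylm):

from transformers import AutoConfig, AutoModelForMaskedLM

# trust_remote_code=True is required so that configuration_ltgbert.py and
# modeling_ltgbert.py are fetched from the repository and executed locally.
config = AutoConfig.from_pretrained("ltg/ltg-bert-babylm", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("ltg/ltg-bert-babylm", trust_remote_code=True)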
configuration_ltgbert.py
CHANGED

@@ -19,6 +19,30 @@
 from transformers.configuration_utils import PretrainedConfig


+LTG_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "bnc-bert-span": "https://huggingface.co/ltg/bnc-bert-span",
+    "bnc-bert-span-2x": "https://huggingface.co/ltg/bnc-bert-span-2x",
+    "bnc-bert-span-0.5x": "https://huggingface.co/ltg/bnc-bert-span-0.5x",
+    "bnc-bert-span-0.25x": "https://huggingface.co/ltg/bnc-bert-span-0.25x",
+    "bnc-bert-span-order": "https://huggingface.co/ltg/bnc-bert-span-order",
+    "bnc-bert-span-document": "https://huggingface.co/ltg/bnc-bert-span-document",
+    "bnc-bert-span-word": "https://huggingface.co/ltg/bnc-bert-span-word",
+    "bnc-bert-span-subword": "https://huggingface.co/ltg/bnc-bert-span-subword",
+
+    "norbert3-xs": "https://huggingface.co/ltg/norbert3-xs/config.json",
+    "norbert3-small": "https://huggingface.co/ltg/norbert3-small/config.json",
+    "norbert3-base": "https://huggingface.co/ltg/norbert3-base/config.json",
+    "norbert3-large": "https://huggingface.co/ltg/norbert3-large/config.json",
+
+    "norbert3-oversampled-base": "https://huggingface.co/ltg/norbert3-oversampled-base/config.json",
+    "norbert3-ncc-base": "https://huggingface.co/ltg/norbert3-ncc-base/config.json",
+    "norbert3-nak-base": "https://huggingface.co/ltg/norbert3-nak-base/config.json",
+    "norbert3-nb-base": "https://huggingface.co/ltg/norbert3-nb-base/config.json",
+    "norbert3-wiki-base": "https://huggingface.co/ltg/norbert3-wiki-base/config.json",
+    "norbert3-c4-base": "https://huggingface.co/ltg/norbert3-c4-base/config.json"
+}
+
+
 class LtgBertConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`LtgBertModel`]. It is used to
@@ -49,7 +73,7 @@ class LtgBertConfig(PretrainedConfig):
         classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
     """
-    model_type = "
+    model_type = "ltgbert"
     def __init__(
        self,
        vocab_size=16384,
@@ -80,4 +104,4 @@ class LtgBertConfig(PretrainedConfig):
        self.output_all_encoded_layers = output_all_encoded_layers
        self.position_bucket_size = position_bucket_size
        self.layer_norm_eps = layer_norm_eps
-       self.classifier_dropout = classifier_dropout
+       self.classifier_dropout = classifier_dropout
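Note on the configuration_ltgbert.py change: the class now declares model_type = "ltgbert", which is the key the transformers Auto machinery uses to map a config onto its model classes. A minimal registration sketch for purely local use (assuming both modules are importable from the working directory; the register calls are the standard transformers API for custom model types, and are only needed when not loading via auto_map with trust_remote_code):

from transformers import AutoConfig, AutoModelForMaskedLM

from configuration_ltgbert import LtgBertConfig
from modeling_ltgbert import LtgBertForMaskedLM

# Map the "ltgbert" model_type to the custom classes so that a local
# checkpoint with this config resolves without remote code execution.
AutoConfig.register("ltgbert", LtgBertConfig)
AutoModelForMaskedLM.register(LtgBertConfig, LtgBertForMaskedLM)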
modeling_ltgbert.py
CHANGED

@@ -39,10 +39,34 @@ from transformers.pytorch_utils import softmax_backward_data
 from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward


-_CHECKPOINT_FOR_DOC = "ltg/
+_CHECKPOINT_FOR_DOC = "ltg/bnc-bert-span"
 _CONFIG_FOR_DOC = "LtgBertConfig"


+LTG_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "bnc-bert-span",
+    "bnc-bert-span-2x",
+    "bnc-bert-span-0.5x",
+    "bnc-bert-span-0.25x",
+    "bnc-bert-span-order",
+    "bnc-bert-span-document",
+    "bnc-bert-span-word",
+    "bnc-bert-span-subword",
+
+    "norbert3-xs",
+    "norbert3-small",
+    "norbert3-base",
+    "norbert3-large",
+
+    "norbert3-oversampled-base",
+    "norbert3-ncc-base",
+    "norbert3-nak-base",
+    "norbert3-nb-base",
+    "norbert3-wiki-base",
+    "norbert3-c4-base"
+]
+
+
 class Encoder(nn.Module):
     def __init__(self, config, activation_checkpointing=False):
         super().__init__()
@@ -224,8 +248,10 @@ class Attention(nn.Module):

         attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)

-
-        query_pos
+        query_pos, key_pos = self.in_proj_qk(self.dropout(relative_embedding)).chunk(2, dim=-1)  # shape: [2T-1, D]
+        query_pos = query_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
+        key_pos = key_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
+
         query = query.view(batch_size, self.num_heads, query_len, self.head_size)
         key = key.view(batch_size, self.num_heads, query_len, self.head_size)

@@ -367,8 +393,6 @@ class LtgBertModel(LtgBertPreTrainedModel):
     ) -> List[torch.Tensor]:
         if input_ids is not None:
             input_shape = input_ids.size()
-        # elif inputs_embeds is not None:
-        #     input_shape = inputs_embeds.size()[:-1]
         else:
             raise ValueError("You have to specify input_ids")

@@ -380,9 +404,7 @@ class LtgBertModel(LtgBertPreTrainedModel):
         else:
             attention_mask = ~attention_mask.bool()
         attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # if inputs_embeds is None:
-        #     static_embeddings, relative_embedding = self.embedding(input_ids.t())
+
         static_embeddings, relative_embedding = self.embedding(input_ids.t())
         contextualized_embeddings, attention_probs = self.transformer(static_embeddings, attention_mask, relative_embedding)
         contextualized_embeddings = [e.transpose(0, 1) for e in contextualized_embeddings]
@@ -409,7 +431,8 @@ class LtgBertModel(LtgBertPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)

         if not return_dict:
             return (
@@ -456,7 +479,8 @@ class LtgBertForMaskedLM(LtgBertModel):
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         subword_prediction = self.classifier(sequence_output)

         masked_lm_loss = None
@@ -554,8 +578,9 @@ class LtgBertForSequenceClassification(LtgBertModel):
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
-
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          inputs_embeds=inputs_embeds,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output[:, 0, :])

         loss = None
@@ -628,7 +653,8 @@ class LtgBertForTokenClassification(LtgBertModel):
     ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output)

         loss = None
@@ -684,7 +710,8 @@ class LtgBertForQuestionAnswering(LtgBertModel):
     ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output)

         start_logits, end_logits = logits.split(1, dim=-1)
@@ -762,7 +789,8 @@ class LtgBertForMultipleChoice(LtgBertModel):
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(flat_input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=flat_input_ids,
+                                                                                                          attention_mask=flat_attention_mask)
         logits = self.head(sequence_output)
         reshaped_logits = logits.view(-1, num_choices)

@@ -785,3 +813,4 @@ class LtgBertForMultipleChoice(LtgBertModel):
             hidden_states=contextualized_embeddings if output_hidden_states else None,
             attentions=attention_probs if output_attentions else None
         )
+
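Note on the modeling_ltgbert.py change: every forward method now passes its inputs to get_contextualized_embeddings by keyword, and Attention.forward again derives per-head relative-position projections from the shared relative_embedding. The shape bookkeeping of those three restored lines is easiest to see with toy dimensions; the sketch below stands in a plain nn.Linear and hard-coded sizes for self.in_proj_qk and the config values (dropout omitted), and assumes T in the shape comments corresponds to position_bucket_size = 32 from config.json, giving 2T-1 relative positions:

import torch
import torch.nn as nn

hidden_size, num_heads = 64, 4
head_size = hidden_size // num_heads
num_positions = 2 * 32 - 1                                # 2T-1 relative position buckets

in_proj_qk = nn.Linear(hidden_size, 2 * hidden_size)      # stand-in for self.in_proj_qk
relative_embedding = torch.randn(num_positions, hidden_size)

# A single projection yields both the query-side and key-side position components.
query_pos, key_pos = in_proj_qk(relative_embedding).chunk(2, dim=-1)   # each [2T-1, D]
query_pos = query_pos.view(-1, num_heads, head_size)                   # [2T-1, H, D/H]
key_pos = key_pos.view(-1, num_heads, head_size)                       # [2T-1, H, D/H]

print(query_pos.shape, key_pos.shape)   # torch.Size([63, 4, 16]) torch.Size([63, 4, 16])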