#!/usr/bin/env python
# encoding: utf-8

from typing import Optional, Tuple, Union

import torch
import torch.nn as nn

from .loss import *
from .model_utils import AllOutput, create_output_loss_lucagplm
from .alphabet import Alphabet
from .modeling_gplm import *
from .lucaone_gplm_config import LucaGPLMConfig
from transformers import PreTrainedModel

class LucaGPLM(PreTrainedModel):
    config_class = LucaGPLMConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.max_position_embeddings = config.max_position_embeddings
        self.type_vocab_size = config.type_vocab_size
        self.num_layers = config.num_hidden_layers
        self.embed_dim = config.hidden_size
        self.attention_heads = config.num_attention_heads
        self.no_position_embeddings = config.no_position_embeddings
        self.no_token_type_embeddings = config.no_token_type_embeddings
        if not isinstance(config.alphabet, Alphabet):
            self.alphabet = Alphabet.from_predefined(config.alphabet)
        else:
            self.alphabet = config.alphabet
        self.alphabet_size = len(self.alphabet)
        self.padding_idx = self.alphabet.padding_idx
        self.mask_idx = self.alphabet.mask_idx
        self.cls_idx = self.alphabet.cls_idx
        self.eos_idx = self.alphabet.eos_idx
        self.prepend_bos = self.alphabet.prepend_bos
        self.append_eos = self.alphabet.append_eos
        self.token_dropout = config.token_dropout
        self.ignore_index = config.ignore_index
        self.use_embed_layer_norm = config.use_embed_layer_norm
        self.use_last_layer_norm = config.use_last_layer_norm
        self.embed_scale = config.embed_scale
        self.embedding_inference = True
        self._init_submodules()

    def _init_submodules(self):
        # nn.Embedding initializes its weights from N(0, 1) by default
        self.embed_tokens = nn.Embedding(
            self.alphabet_size,
            self.embed_dim,
            padding_idx=self.padding_idx,
        )
        self.embed_pos = None
        if not self.no_position_embeddings:
            self.embed_pos = nn.Embedding(self.max_position_embeddings, self.embed_dim)
        self.embed_type = None
        if not self.no_token_type_embeddings:
            self.embed_type = nn.Embedding(self.type_vocab_size, self.embed_dim)
        if self.use_embed_layer_norm:
            self.embed_layer_norm = LucaGPLM1bLayerNorm(self.embed_dim)
        else:
            self.embed_layer_norm = None

        self.layers = nn.ModuleList(
            [
                LucaGPLMTransformerLayer(
                    self.embed_dim,
                    4 * self.embed_dim,
                    self.attention_heads,
                    add_bias_kv=False,
                    use_lucagplm1b_layer_norm=True,
                    use_rotary_embeddings=True,
                    )
                for _ in range(self.num_layers)
            ]
        )
        self.layer_size = len(self.layers)

        if not self.embedding_inference:
            self.contact_head = ContactPredictionHead(
                self.num_layers * self.attention_heads,
                self.prepend_bos,
                self.append_eos,
                eos_idx=self.eos_idx,
                )
        if self.use_last_layer_norm:
            self.last_layer_norm = LucaGPLM1bLayerNorm(self.embed_dim)
        else:
            self.last_layer_norm = None
        if not self.embedding_inference:
            self.lm_head = RobertaLMHead(
                embed_dim=self.embed_dim,
                output_dim=self.alphabet_size,
                weight=self.embed_tokens.weight,
            )

    def _init_embedding(self, pretrained_token_matrix, token_matrix):
        '''
        Copy pretrained (ESM) token-embedding rows into this model's embedding
        matrix, remapping the vocabulary indices (pretrained -> new):
        0->2, 1->0, 2->3, 3->1, 4->10, ..., 28->34, 29->36, 30->37, 31->38, 32->4
        '''
        # print("Loading pretrained embedding vectors for overlapping tokens:")
        token_matrix[2, :] = pretrained_token_matrix[0, :]
        token_matrix[0, :] = pretrained_token_matrix[1, :]
        token_matrix[3, :] = pretrained_token_matrix[2, :]
        token_matrix[1, :] = pretrained_token_matrix[3, :]
        for idx in range(10, 35):
            token_matrix[idx, :] = pretrained_token_matrix[idx - 6, :]
        token_matrix[36, :] = pretrained_token_matrix[29, :]
        token_matrix[37, :] = pretrained_token_matrix[30, :]
        token_matrix[38, :] = pretrained_token_matrix[31, :]
        token_matrix[4, :] = pretrained_token_matrix[32, :]
        return token_matrix

    def _init_submodules_new(self, pretrained_model_name):
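        '''
        Initialize this model from a pretrained ESM checkpoint loaded with
        esm.pretrained.load_model_and_alphabet: rename the layer-norm keys to
        this model's naming, remap the token-embedding rows via _init_embedding,
        copy every weight whose shape matches, and keep this model's own
        initialization for everything else.
        '''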
        # print("Load pretrained model exists weights:")
        from esm import pretrained
        from collections import OrderedDict
        pretrained, _ = pretrained.load_model_and_alphabet(pretrained_model_name)
        pretrained_state_dict = pretrained.state_dict()
        new_state_dict = OrderedDict()
        our_model_state_dict = {}
        for key, value in self.state_dict().items():
            our_model_state_dict[key] = value
        for name, weight in pretrained_state_dict.items():
            if "final_layer_norm" in name:
                name = name.replace("final_layer_norm", "post_layer_norm")
            elif "self_attn_layer_norm" in name:
                name = name.replace("self_attn_layer_norm", "pre_layer_norm")
            elif "emb_layer_norm_after" in name:
                name = name.replace("emb_layer_norm_after", "last_layer_norm")
            if name.startswith("layers."):
                layer_id = name.split(".")[1]
                if int(layer_id) >= self.num_layers:
                    continue
            if name == "embed_tokens.weight":
                new_state_dict[name] = self._init_embedding(weight, our_model_state_dict[name])
                del our_model_state_dict[name]
            elif name in our_model_state_dict and our_model_state_dict[name].shape == weight.shape:
                del our_model_state_dict[name]
                new_state_dict[name] = weight
        '''
        print("Layer names loaded from the pretrained model:")
        print(new_state_dict.keys())
        print("Layer names kept from this model's own initialization:")
        print(our_model_state_dict.keys())
        '''
        new_state_dict.update(our_model_state_dict)
        self.load_state_dict(new_state_dict)

    def __calc_loss__(self, task_level_type, output_mode, logits, label, label_size, loss_fct, loss_reduction):
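        '''
        Dispatch the loss computation by output_mode ("regression",
        "multi_label", "binary_class" (or label_size <= 2), "multi_class"),
        reshaping logits/labels as required by the task level and the loss
        reduction before calling loss_fct.
        '''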
        if output_mode in ["regression"]:
            if task_level_type not in ["seq_level"] and loss_reduction == "meanmean":
                # structure-level regression
                # logits: N, seq_len, 3
                # label: N, seq_len, 3
                loss = loss_fct(logits, label)
            else:
                # structure-level regression
                # logits: N * seq_len * 3
                # label: N * seq_len * 3
                loss = loss_fct(logits.view(-1), label.view(-1))
        elif output_mode in ["multi_label", "multi-label"]:
            # only for seq-level
            if loss_reduction == "meanmean":
                # logits: N , label_size
                # label: N , label_size
                loss = loss_fct(logits, label.float())
            else:
                # logits: N , label_size
                # label: N , label_size
                loss = loss_fct(logits.view(-1, label_size), label.view(-1, label_size).float())
        elif label_size <= 2 or output_mode in ["binary_class", "binary-class"]:
            if task_level_type not in ["seq_level"] and loss_reduction == "meanmean":
                # token-level & meanmean
                # logits: N ,seq_len, 1
                # label: N, seq_len
                loss = loss_fct(logits, label.float())
            else:
                # seq-level || token-level
                # logits: N
                # label: N
                loss = loss_fct(logits.view(-1), label.view(-1).float())
        elif output_mode in ["multi_class", "multi-class"]:
            if task_level_type not in ["seq_level"] and loss_reduction == "meanmean":
                # token-level
                # logits: N ,seq_len, label_size
                # label: N , seq_len
                loss = loss_fct(logits, label)
            else:
                # token-level
                # logits: N * seq_len, label_size
                # label: N * seq_len
                # seq-level
                # logits: N, label_size
                # label: N
                loss = loss_fct(logits.view(-1, label_size), label.view(-1))
        else:
            raise Exception("Unsupported output_mode=%s" % output_mode)
        return loss

    def __forword__(self,
                    input_ids: Optional[torch.Tensor] = None,
                    attention_mask: Optional[torch.Tensor] = None,
                    token_type_ids: Optional[torch.Tensor] = None,
                    position_ids: Optional[torch.Tensor] = None,
                    output_keys: Optional[dict[str, set[str]]] = None,
                    labels: Optional[dict[str, dict[str, torch.Tensor]]] = None,
                    repr_layers=[-1],
                    need_head_weights=False,
                    return_contacts=False,
                    use_last_layer_norm=True):
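        '''
        Encode a single sequence: token (plus optional position / token-type)
        embeddings, optional token dropout, the transformer stack, and an
        optional final layer norm.

        Returns a 4-tuple (representations, logits, outputs, losses).
        `representations` always holds "representation_matrix" (B, L, E) and
        "representation_vector" (B, E, taken at the CLS position), plus
        "attentions" when need_head_weights is set and "contacts" when
        return_contacts is set and the contact head exists. The other three
        dicts are only filled when task heads are enabled
        (self.embedding_inference is False) and output_keys / labels are given.
        '''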
        assert all(-(self.layer_size + 1) <= i <= self.layer_size for i in repr_layers)
        repr_layers = [(i + self.layer_size + 1) % (self.layer_size + 1) for i in repr_layers]

        if return_contacts:
            need_head_weights = True

        assert input_ids.ndim == 2
        # Build the padding mask on the fly: (B, seq_len), True at masked (padding) positions
        if attention_mask is None:
            padding_mask = input_ids.eq(self.padding_idx)
        else:
            padding_mask = attention_mask.eq(self.padding_idx)

        x = self.embed_scale * self.embed_tokens(input_ids)
        if self.embed_pos is not None and position_ids is not None:
            x += self.embed_scale * self.embed_pos(position_ids)
        if self.embed_type is not None and token_type_ids is not None:
            x += self.embed_scale * self.embed_type(token_type_ids)
        if self.embed_layer_norm is not None:
            x = self.embed_layer_norm(x)
        # Token dropout
        if self.token_dropout:
            x.masked_fill_((input_ids == self.mask_idx).unsqueeze(-1), 0.0)
            # x: B x L x C
            mask_ratio_train = 0.15 * 0.8
            src_lengths = (~padding_mask).sum(-1)
            mask_ratio_observed = (input_ids == self.mask_idx).sum(-1).to(x.dtype) / src_lengths
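            # Rescale the embeddings so their expected magnitude matches
            # training, where on average 0.15 * 0.8 = 12% of tokens are zeroed
            # out by masking; the observed mask ratio of this batch is divided
            # out instead (ESM-style token-dropout compensation).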
            x = x * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]

        # Zero out the embeddings at padded positions
        if padding_mask is not None:
            x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))

        # Which hidden layers to include in the returned representations
        repr_layers = set(repr_layers)
        hidden_representations = {}
        # 0:embedding
        if 0 in repr_layers:
            hidden_representations[0] = x

        # Whether to collect per-head attention weights
        if need_head_weights:
            attn_weights = []

        # (B, L, E) => (L, B, E)
        x = x.transpose(0, 1)

        if not padding_mask.any():
            padding_mask = None

        for layer_idx, layer in enumerate(self.layers):
            x, attn = layer(
                x,
                self_attn_padding_mask=padding_mask,
                need_head_weights=need_head_weights,
            )
            if (layer_idx + 1) in repr_layers:
                hidden_representations[layer_idx + 1] = x.transpose(0, 1)
            if need_head_weights:
                # (H, B, L, L) => (B, H, L, L)
                attn_weights.append(attn.transpose(1, 0))

        # (L, B, E)
        if self.last_layer_norm is not None and use_last_layer_norm:
            # Apply a final layer norm on top of the last hidden layer
            x = self.last_layer_norm(x)
        x = x.transpose(0, 1)  # (L, B, E) => (B, L,  E)

        # last hidden representation should have layer norm applied
        if (layer_idx + 1) in repr_layers:
            hidden_representations[layer_idx + 1] = x
        # The last layer serves as the representation matrix
        # (B, L, E)
        representation_matrix = hidden_representations[self.layer_size]
        # Masked-LM (token-level) head
        # (B, seq_len, vocab_size)
        if not self.embedding_inference:
            lm_mask_logits = self.lm_head(x)
        # The vector at the first (CLS) position serves as the sequence-level representation
        # (B, E)
        representation_vector = representation_matrix[:, 0, :]

        logits = {}
        losses = {}
        outputs = {}
        representations = {
            "representation_matrix": representation_matrix,
            "representation_vector": representation_vector
        }
        # Attention weights from every layer
        if need_head_weights:
            # attentions: B x Layers x H x L x L
            attentions = torch.stack(attn_weights, 1)
            if padding_mask is not None:
                attention_mask = 1 - padding_mask.type_as(attentions)
                attention_mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
                attentions = attentions * attention_mask[:, None, None, :, :]
            representations["attentions"] = attentions
            # Predict the contact map
            if return_contacts and hasattr(self, "contact_head") \
                    and not self.embedding_inference:
                contacts = self.contact_head(input_ids, attentions)
                representations["contacts"] = contacts
        '''
        print("output_keys:")
        print(output_keys)
        '''
        if not self.embedding_inference and output_keys:
            for item in output_keys.items():
                cur_task_level_type = item[0]
                if cur_task_level_type not in logits:
                    logits[cur_task_level_type] = {}
                    outputs[cur_task_level_type] = {}
                for cur_task_level_name in item[1]:
                    if cur_task_level_type == "token_level":
                        cur_logits = lm_mask_logits
                    elif cur_task_level_type == "seq_level":
                        cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](representation_vector)
                        cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
                        if cur_hidden_layer is not None:
                            cur_logits = cur_hidden_layer(cur_logits)
                        cur_hidden_act = self.hidden_act[cur_task_level_type][cur_task_level_name]
                        if cur_hidden_act is not None:
                            cur_logits = cur_hidden_act(cur_logits)
                        cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
                    elif cur_task_level_type == "span_level":
                        cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](representation_matrix)
                        cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
                        if cur_hidden_layer is not None:
                            cur_logits = cur_hidden_layer(cur_logits)
                        cur_hidden_act = self.hidden_act[cur_task_level_type][cur_task_level_name]
                        if cur_hidden_act is not None:
                            cur_logits = cur_hidden_act(cur_logits)
                        cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
                    elif cur_task_level_type == "structure_level":
                        cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](representation_matrix)
                        cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
                        if cur_hidden_layer is not None:
                            cur_logits = cur_hidden_layer(cur_logits)
                        cur_hidden_act = self.hidden_act[cur_task_level_type][cur_task_level_name]
                        if cur_hidden_act is not None:
                            cur_logits = cur_hidden_act(cur_logits)
                        cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
                    logits[cur_task_level_type][cur_task_level_name] = cur_logits
                    if cur_task_level_type in self.output and cur_task_level_name in self.output[cur_task_level_type] \
                            and self.output[cur_task_level_type][cur_task_level_name] is not None:
                        outputs[cur_task_level_type][cur_task_level_name] = self.output[cur_task_level_type][cur_task_level_name](cur_logits)
                    else:
                        outputs[cur_task_level_type][cur_task_level_name] = cur_logits
                    if labels is not None and cur_task_level_type in labels and cur_task_level_name in labels[cur_task_level_type]:
                        if cur_task_level_type not in losses:
                            losses[cur_task_level_type] = {}
                        cur_label = labels[cur_task_level_type][cur_task_level_name]
                        cur_label_size = self.label_size[cur_task_level_type][cur_task_level_name]
                        cur_output_mode = self.output_mode[cur_task_level_type][cur_task_level_name]
                        cur_loss_fct = self.loss_fct[cur_task_level_type][cur_task_level_name]
                        cur_loss = self.__calc_loss__(
                                                      task_level_type=cur_task_level_type,
                                                      output_mode=cur_output_mode,
                                                      logits=cur_logits,
                                                      label=cur_label,
                                                      label_size=cur_label_size,
                                                      loss_fct=cur_loss_fct,
                                                      loss_reduction="meanmean")
                        losses[cur_task_level_type][cur_task_level_name] = cur_loss
        return representations, logits, outputs, losses

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            global_attention_mask: Optional[torch.Tensor] = None,
            token_type_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            head_mask: Optional[torch.Tensor] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            output_keys: Optional[dict[str, set[str]]] = None,
            labels: Optional[dict[str, dict[str, torch.Tensor]]] = None,
            input_ids_b: Optional[torch.Tensor] = None,
            attention_mask_b: Optional[torch.Tensor] = None,
            global_attention_mask_b: Optional[torch.Tensor] = None,
            token_type_ids_b: Optional[torch.Tensor] = None,
            position_ids_b: Optional[torch.Tensor] = None,
            head_mask_b: Optional[torch.Tensor] = None,
            inputs_embeds_b: Optional[torch.Tensor] = None,
            output_keys_b: Optional[dict[str, set[str]]] = None,
            labels_b: Optional[dict[str, dict[str, torch.Tensor]]] = None,
            pair_label: Optional[dict[str, dict[str, torch.Tensor]]] = None,
            pair_output_keys: Optional[dict[str, set[str]]] = None,
            output_hidden_states: Optional[dict[str, set[str]]] = None,
            output_attentions: Optional[dict[str, set[str]]] = None,
            need_head_weights: Optional[bool] = None,
            return_contacts: Optional[bool] = None,
            repr_layers: Optional[list[int]] = None,
            return_dict: Optional[bool] = None,
            use_last_layer_norm: Optional[bool] = True
    ) -> Union[Tuple[torch.Tensor], AllOutput]:
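        '''
        Run the encoder on sequence a (input_ids / inputs_embeds) and, when
        given, on sequence b (the *_b arguments). Returns an AllOutput when
        return_dict resolves to True, otherwise nested lists of losses,
        outputs and encodings. Pair-level heads (pair_output_keys / pair_label)
        are only evaluated when both sequences are present and task heads are
        enabled (self.embedding_inference is False).
        '''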
        if return_dict is None and self.config is not None:
            return_dict = self.config.use_return_dict
        if return_dict is None:
            return_dict = False
        if repr_layers is None or len(repr_layers) == 0:
            repr_layers = [-1]
        if return_contacts is None:
            return_contacts = False
        if need_head_weights is None:
            need_head_weights = True
        has_pair = False
        has_pair_b = False
        if input_ids is not None or inputs_embeds is not None:
            encoding, logits, outputs, losses = self.__forword__(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                output_keys=output_keys,
                labels=labels,
                repr_layers=repr_layers,
                need_head_weights=need_head_weights,
                return_contacts=return_contacts,
                use_last_layer_norm=use_last_layer_norm
            )
            has_pair = True
        if input_ids_b is not None or inputs_embeds_b is not None:
            encoding_b, logits_b, outputs_b, losses_b = self.__forword__(
                input_ids=input_ids_b,
                attention_mask=attention_mask_b,
                token_type_ids=token_type_ids_b,
                position_ids=position_ids_b,
                output_keys=output_keys_b,
                labels=labels_b,
                repr_layers=repr_layers,
                need_head_weights=need_head_weights,
                return_contacts=return_contacts,
                use_last_layer_norm=use_last_layer_norm
            )
            has_pair_b = True

        if not self.embedding_inference:
            if has_pair and has_pair_b and pair_output_keys and len(pair_output_keys) > 0:
                cur_representation_vector = encoding["representation_vector"]
                cur_representation_vector_b = encoding_b["representation_vector"]

                pair_logits = {}
                pair_outputs = {}
                for item1 in pair_output_keys.items():
                    cur_task_level_type = item1[0]
                    if cur_task_level_type not in pair_outputs:
                        pair_outputs[cur_task_level_type] = {}
                        pair_logits[cur_task_level_type] = {}
                    for cur_task_level_name in item1[1]:
                        cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](
                            torch.cat((cur_representation_vector, cur_representation_vector_b), dim=-1)
                        )
                        cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
                        if cur_hidden_layer is not None:
                            cur_logits = cur_hidden_layer(cur_logits)
                        cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
                        pair_logits[cur_task_level_type][cur_task_level_name] = cur_logits
                        pair_outputs[cur_task_level_type][cur_task_level_name] = self.output[cur_task_level_type][cur_task_level_name](cur_logits)

                if pair_label is not None:
                    pair_loss = {}
                    for item1 in pair_output_keys.items():
                        cur_task_level_type = item1[0]
                        if cur_task_level_type not in pair_label:
                            continue
                        if cur_task_level_type in pair_label:
                            pair_loss[cur_task_level_type] = {}
                        for cur_task_level_name in item1[1]:
                            if cur_task_level_name not in pair_label[cur_task_level_type]:
                                continue
                            cur_label = pair_label[cur_task_level_type][cur_task_level_name]
                            cur_label_size = self.label_size[cur_task_level_type][cur_task_level_name]
                            cur_output_mode = self.output_mode[cur_task_level_type][cur_task_level_name]
                            cur_loss_fct = self.loss_fct[cur_task_level_type][cur_task_level_name]
                            cur_logits = pair_logits[cur_task_level_type][cur_task_level_name]
                            cur_loss = self.__calc_loss__(
                                task_level_type=cur_task_level_type,
                                output_mode=cur_output_mode, logits=cur_logits,
                                label=cur_label, label_size=cur_label_size, loss_fct=cur_loss_fct,
                                loss_reduction="meanmean")
                            pair_loss[cur_task_level_type][cur_task_level_name] = cur_loss

                    if not return_dict:
                        return [[losses, losses_b, pair_loss], [outputs, outputs_b, pair_outputs]] + [[encoding, encoding_b]]
                    return AllOutput(
                        losses=losses,
                        outputs=outputs,
                        hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
                        attentions=encoding["attentions"] if "attentions" in encoding else None,
                        global_attentions=None,
                        contacts=encoding["contacts"] if "contacts" in encoding else None,
                        losses_b=losses_b,
                        outputs_b=outputs_b,
                        hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
                        attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
                        global_attentions_b=None,
                        contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None,
                        pair_outputs=pair_outputs,
                        pair_losses=pair_loss)
                else:
                    if not return_dict:
                        return [[losses, losses_b], [outputs, outputs_b]] + [[encoding, encoding_b]]
                    return AllOutput(
                        losses=losses,
                        outputs=outputs,
                        hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
                        attentions=encoding["attentions"] if "attentions" in encoding else None,
                        global_attentions=None,
                        contacts=encoding["contacts"] if "contacts" in encoding else None,
                        losses_b=losses_b,
                        outputs_b=outputs_b,
                        hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
                        attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
                        global_attentions_b=None,
                        contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
                    )
            elif has_pair:
                if not return_dict:
                    return [[losses], [outputs], [encoding]]
                return AllOutput(
                    losses=losses,
                    outputs=outputs,
                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
                    attentions=encoding["attentions"] if "attentions" in encoding else None,
                    global_attentions=None,
                    contacts=encoding["contacts"] if "contacts" in encoding else None
                )
            else:
                if not return_dict:
                    return [[losses_b], [outputs_b], [encoding_b]]
                return AllOutput(
                    losses_b=losses_b,
                    outputs_b=outputs_b,
                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
                    global_attentions_b=None,
                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
                )
        else:
            if has_pair and has_pair_b:
                if not return_dict:
                    return [[None, None], [None, None]] + [[encoding, encoding_b]]
                return AllOutput(
                    losses=None,
                    outputs=None,
                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
                    attentions=encoding["attentions"] if "attentions" in encoding else None,
                    global_attentions=None,
                    contacts=encoding["contacts"] if "contacts" in encoding else None,
                    losses_b=None,
                    outputs_b=None,
                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
                    global_attentions_b=None,
                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
                )
            elif has_pair:
                if not return_dict:
                    return [[None], [None], [encoding]]
                return AllOutput(
                    losses=None,
                    outputs=None,
                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
                    attentions=encoding["attentions"] if "attentions" in encoding else None,
                    global_attentions=None,
                    contacts=encoding["contacts"] if "contacts" in encoding else None
                )
            else:
                if not return_dict:
                    return [[None], [None], [encoding_b]]
                return AllOutput(
                    losses_b=None,
                    outputs_b=None,
                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
                    global_attentions_b=None,
                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
                )

    def predict_contacts(self, input_ids, position_ids=None, token_type_ids=None):
        return self(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            return_contacts=True)["contacts"]
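

# -----------------------------------------------------------------------------
# Usage sketch (illustrative only; not part of the original module). It assumes
# the package is importable as `lucagplm` and that "prot" is a valid predefined
# alphabet name -- both are unverified placeholders, as are the token ids 5/6/7.
#
#   import torch
#   from lucagplm import LucaGPLM, LucaGPLMConfig
#
#   config = LucaGPLMConfig(alphabet="prot")
#   model = LucaGPLM(config).eval()
#   input_ids = torch.tensor([[model.cls_idx, 5, 6, 7, model.eos_idx]])
#   with torch.no_grad():
#       out = model(input_ids=input_ids, return_dict=True)
#   per_token_embeddings = out.hidden_states         # (B, L, E)
#   sequence_embedding = out.hidden_states[:, 0, :]  # (B, E), CLS position
# -----------------------------------------------------------------------------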