Upload 7 files

- alphabet.py +0 -9
- lucaone_gplm.py +142 -109
- modeling_bert.py +27 -33
- modeling_gplm.py +26 -75
alphabet.py
CHANGED
@@ -6,7 +6,6 @@ import json
 import itertools
 from typing import Sequence, List
 from transformers import PreTrainedTokenizer
-from .batch_converter import BatchConverter
 
 gene_standard_toks = ['1', '2', '3', '4', '5', '.', '-', '*']
 
@@ -63,14 +62,6 @@ class Alphabet(object):
     def to_dict(self):
         return self.tok_to_idx.copy()
 
-    def get_batch_converter(self, no_position_embeddings, no_token_type_embeddings, truncation_seq_length: int = None, ignore_index: int = -100, mlm_probability=0.15):
-        return BatchConverter(self,
-                              no_position_embeddings=no_position_embeddings,
-                              no_token_type_embeddings=no_token_type_embeddings,
-                              truncation_seq_length=truncation_seq_length,
-                              ignore_index=ignore_index,
-                              mlm_probability=mlm_probability)
-
     @classmethod
     def from_predefined(cls, name: str):
         if name.lower() == "prot":
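With `get_batch_converter` and the `BatchConverter` import removed, `Alphabet` now only defines the vocabulary itself. A minimal sketch of how a caller might turn a sequence into token ids with the remaining API, assuming `from_predefined("prot")` and `to_dict()` behave as shown in the diff; the manual lookup loop and the `"<unk>"` fallback key are illustrative, not the repository's batching code:

```python
from alphabet import Alphabet  # module path as in this repo; adjust to your install

# Build the predefined protein alphabet and grab its token -> index map.
alphabet = Alphabet.from_predefined("prot")
tok_to_idx = alphabet.to_dict()

# Encode one sequence character by character; "<unk>" is a hypothetical fallback entry.
seq = "MKTAYIAKQR"
token_ids = [tok_to_idx.get(ch, tok_to_idx.get("<unk>", 0)) for ch in seq]
print(token_ids)
```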
lucaone_gplm.py
CHANGED
@@ -37,6 +37,7 @@ class LucaGPLM(PreTrainedModel):
         self.use_embed_layer_norm = config.use_embed_layer_norm
         self.use_last_layer_norm = config.use_last_layer_norm
         self.embed_scale = config.embed_scale
+        self.embedding_inference = True
         self._init_submodules()
 
     def _init_submodules(self):
@@ -72,22 +73,23 @@ class LucaGPLM(PreTrainedModel):
         )
         self.layer_size = len(self.layers)
 
-        self.contact_head = ContactPredictionHead(
-            self.num_layers * self.attention_heads,
-            self.prepend_bos,
-            self.append_eos,
-            eos_idx=self.eos_idx,
-        )
+        if not self.embedding_inference:
+            self.contact_head = ContactPredictionHead(
+                self.num_layers * self.attention_heads,
+                self.prepend_bos,
+                self.append_eos,
+                eos_idx=self.eos_idx,
+            )
         if self.use_last_layer_norm:
             self.last_layer_norm = LucaGPLM1bLayerNorm(self.embed_dim)
         else:
             self.last_layer_norm = None
-        self.lm_head = RobertaLMHead(
-            embed_dim=self.embed_dim,
-            output_dim=self.alphabet_size,
-            weight=self.embed_tokens.weight,
-        )
+        if not self.embedding_inference:
+            self.lm_head = RobertaLMHead(
+                embed_dim=self.embed_dim,
+                output_dim=self.alphabet_size,
+                weight=self.embed_tokens.weight,
+            )
 
     def _init_embedding(self, pretrained_token_matrix, token_matrix):
         '''
@@ -103,7 +105,7 @@ class LucaGPLM(PreTrainedModel):
         31->38
         32->4
         '''
-        print("Load pretrained exists embedding vectors:")
+        # print("Load pretrained exists embedding vectors:")
         token_matrix[2, :] = pretrained_token_matrix[0, :]
         token_matrix[0, :] = pretrained_token_matrix[1, :]
         token_matrix[3, :] = pretrained_token_matrix[2, :]
@@ -117,7 +119,7 @@ class LucaGPLM(PreTrainedModel):
         return token_matrix
 
     def _init_submodules_new(self, pretrained_model_name):
-        print("Load pretrained model exists weights:")
+        # print("Load pretrained model exists weights:")
         from esm import pretrained
         from collections import OrderedDict
         pretrained, _ = pretrained.load_model_and_alphabet(pretrained_model_name)
@@ -143,33 +145,16 @@ class LucaGPLM(PreTrainedModel):
             elif name in our_model_state_dict and our_model_state_dict[name].shape == weight.shape:
                 del our_model_state_dict[name]
                 new_state_dict[name] = weight
-
+        '''
         print("Exists layer names:")
         print(new_state_dict.keys())
         print("Not exists Layer names:")
         print(our_model_state_dict.keys())
+        '''
         new_state_dict.update(our_model_state_dict)
         self.load_state_dict(new_state_dict)
 
     def __calc_loss__(self, task_level_type, output_mode, logits, label, label_size, loss_fct, loss_reduction):
-        '''
-        if label_size <= 2 or output_mode in ["binary_class", "binary-class"]:
-            loss = loss_fct(logits.view(-1), label.view(-1).float())
-        elif output_mode in ["multi_label", "multi-label"]:
-            loss = loss_fct(logits.view(-1, label_size), label.view(-1, label_size).float())
-        elif output_mode in ["multi_class", "multi-class"]:
-            loss = loss_fct(logits.view(-1, label_size), label.view(-1))
-        else:
-            loss = loss_fct(logits.view(-1), label.view(-1))
-        return loss
-        '''
-        '''
-        print(task_level_type, output_mode, label_size, loss_fct, loss_reduction)
-        print("logits:")
-        print(logits.shape)
-        print("label:")
-        print(label.shape)
-        '''
         if output_mode in ["regression"]:
             if task_level_type not in ["seq_level"] and loss_reduction == "meanmean":
                 # structure-level regression
@@ -307,7 +292,8 @@ class LucaGPLM(PreTrainedModel):
         representation_matrix = hidden_representations[self.layer_size]
         # mask task
         # B * Seq_len * vocab_size
-        lm_mask_logits = self.lm_head(x)
+        if not self.embedding_inference:
+            lm_mask_logits = self.lm_head(x)
         # the lm head output vector serves as the representation vector
         # (B, E)
         representation_vector = representation_matrix[:, 0, :]
@@ -329,14 +315,15 @@ class LucaGPLM(PreTrainedModel):
             attentions = attentions * attention_mask[:, None, None, :, :]
             representations["attentions"] = attentions
             # predict the contact matrix
-            if return_contacts:
+            if return_contacts and hasattr(self, "contact_head") \
+                    and not self.embedding_inference:
                 contacts = self.contact_head(input_ids, attentions)
                 representations["contacts"] = contacts
         '''
         print("output_keys:")
         print(output_keys)
         '''
-        if output_keys:
+        if not self.embedding_inference and output_keys:
             for item in output_keys.items():
                 cur_task_level_type = item[0]
                 if cur_task_level_type not in logits:
@@ -466,107 +453,153 @@ class LucaGPLM(PreTrainedModel):
                 use_last_layer_norm=use_last_layer_norm
             )
             has_pair_b = True
-        if has_pair and has_pair_b and pair_output_keys and len(pair_output_keys) > 0:
-            cur_representation_vector = encoding["representation_vector"]
-            cur_representation_vector_b = encoding_b["representation_vector"]
-
-            pair_logits = {}
-            pair_outputs = {}
-            for item1 in pair_output_keys.items():
-                cur_task_level_type = item1[0]
-                if cur_task_level_type not in pair_outputs:
-                    pair_outputs[cur_task_level_type] = {}
-                    pair_logits[cur_task_level_type] = {}
-                for cur_task_level_name in item1[1]:
-                    cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](
-                        torch.cat((cur_representation_vector, cur_representation_vector_b), dim=-1)
-                    )
-                    cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
-                    if cur_hidden_layer is not None:
-                        cur_logits = cur_hidden_layer(cur_logits)
-                    cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
-                    pair_logits[cur_task_level_type][cur_task_level_name] = cur_logits
-                    pair_outputs[cur_task_level_type][cur_task_level_name] = self.output[cur_task_level_type][cur_task_level_name](cur_logits)
-
-            if pair_label is not None:
-                pair_loss = {}
-                for item1 in pair_output_keys.items():
-                    cur_task_level_type = item1[0]
-                    if cur_task_level_type not in pair_label:
-                        continue
-                    pair_loss[cur_task_level_type] = {}
-                    for cur_task_level_name in item1[1]:
-                        if cur_task_level_name not in pair_label[cur_task_level_type]:
-                            continue
-                        cur_label = pair_label[cur_task_level_type][cur_task_level_name]
-                        cur_label_size = self.label_size[cur_task_level_type][cur_task_level_name]
-                        cur_output_mode = self.output_mode[cur_task_level_type][cur_task_level_name]
-                        cur_loss_fct = self.loss_fct[cur_task_level_type][cur_task_level_name]
-                        cur_logits = pair_logits[cur_task_level_type][cur_task_level_name]
-                        cur_loss = self.__calc_loss__(
-                            task_level_type=cur_task_level_type,
-                            output_mode=cur_output_mode, logits=cur_logits,
-                            label=cur_label, label_size=cur_label_size, loss_fct=cur_loss_fct,
-                            loss_reduction="meanmean")
-                        pair_loss[cur_task_level_type][cur_task_level_name] = cur_loss
-
-                if not return_dict:
-                    return [[losses, losses_b, pair_loss], [outputs, outputs_b, pair_outputs]] + [[encoding, encoding_b]]
-                return AllOutput(
-                    losses=losses,
-                    outputs=outputs,
-                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
-                    attentions=encoding["attentions"] if "attentions" in encoding else None,
-                    global_attentions=None,
-                    contacts=encoding["contacts"] if "contacts" in encoding else None,
-                    losses_b=losses_b,
-                    outputs_b=outputs_b,
-                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
-                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
-                    global_attentions_b=None,
-                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None,
-                    pair_outputs=pair_outputs,
-                    pair_losses=pair_loss)
-            else:
-                if not return_dict:
-                    return [[losses, losses_b], [outputs, outputs_b]] + [[encoding, encoding_b]]
-                return AllOutput(
-                    losses=losses,
-                    outputs=outputs,
-                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
-                    attentions=encoding["attentions"] if "attentions" in encoding else None,
-                    global_attentions=None,
-                    contacts=encoding["contacts"] if "contacts" in encoding else None,
-                    losses_b=losses_b,
-                    outputs_b=outputs_b,
-                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
-                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
-                    global_attentions_b=None,
-                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
-                )
-        elif has_pair:
-            if not return_dict:
-                return [[losses], [outputs], [encoding]]
-            return AllOutput(
-                losses=losses,
-                outputs=outputs,
-                hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
-                attentions=encoding["attentions"] if "attentions" in encoding else None,
-                global_attentions=None,
-                contacts=encoding["contacts"] if "contacts" in encoding else None
-            )
-        else:
-            if not return_dict:
-                return [[losses_b], [outputs_b], [encoding_b]]
-            return AllOutput(
-                losses_b=losses_b,
-                outputs_b=outputs_b,
-                hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
-                attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
-                global_attentions_b=None,
-                contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
-            )
+        if not self.embedding_inference:
+            if has_pair and has_pair_b and pair_output_keys and len(pair_output_keys) > 0:
+                cur_representation_vector = encoding["representation_vector"]
+                cur_representation_vector_b = encoding_b["representation_vector"]
+
+                pair_logits = {}
+                pair_outputs = {}
+                for item1 in pair_output_keys.items():
+                    cur_task_level_type = item1[0]
+                    if cur_task_level_type not in pair_outputs:
+                        pair_outputs[cur_task_level_type] = {}
+                        pair_logits[cur_task_level_type] = {}
+                    for cur_task_level_name in item1[1]:
+                        cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](
+                            torch.cat((cur_representation_vector, cur_representation_vector_b), dim=-1)
+                        )
+                        cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
+                        if cur_hidden_layer is not None:
+                            cur_logits = cur_hidden_layer(cur_logits)
+                        cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
+                        pair_logits[cur_task_level_type][cur_task_level_name] = cur_logits
+                        pair_outputs[cur_task_level_type][cur_task_level_name] = self.output[cur_task_level_type][cur_task_level_name](cur_logits)
+
+                if pair_label is not None:
+                    pair_loss = {}
+                    for item1 in pair_output_keys.items():
+                        cur_task_level_type = item1[0]
+                        if cur_task_level_type not in pair_label:
+                            continue
+                        if cur_task_level_type in pair_label:
+                            pair_loss[cur_task_level_type] = {}
+                            for cur_task_level_name in item1[1]:
+                                if cur_task_level_name not in pair_label[cur_task_level_type]:
+                                    continue
+                                cur_label = pair_label[cur_task_level_type][cur_task_level_name]
+                                cur_label_size = self.label_size[cur_task_level_type][cur_task_level_name]
+                                cur_output_mode = self.output_mode[cur_task_level_type][cur_task_level_name]
+                                cur_loss_fct = self.loss_fct[cur_task_level_type][cur_task_level_name]
+                                cur_logits = pair_logits[cur_task_level_type][cur_task_level_name]
+                                cur_loss = self.__calc_loss__(
+                                    task_level_type=cur_task_level_type,
+                                    output_mode=cur_output_mode, logits=cur_logits,
+                                    label=cur_label, label_size=cur_label_size, loss_fct=cur_loss_fct,
+                                    loss_reduction="meanmean")
+                                pair_loss[cur_task_level_type][cur_task_level_name] = cur_loss
+
+                    if not return_dict:
+                        return [[losses, losses_b, pair_loss], [outputs, outputs_b, pair_outputs]] + [[encoding, encoding_b]]
+                    return AllOutput(
+                        losses=losses,
+                        outputs=outputs,
+                        hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
+                        attentions=encoding["attentions"] if "attentions" in encoding else None,
+                        global_attentions=None,
+                        contacts=encoding["contacts"] if "contacts" in encoding else None,
+                        losses_b=losses_b,
+                        outputs_b=outputs_b,
+                        hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
+                        attentions_b=encoding_b["attentions"] if "hidden_states" in encoding_b else None,
+                        global_attentions_b=None,
+                        contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None,
+                        pair_outputs=pair_outputs,
+                        pair_losses=pair_loss)
+                else:
+                    if not return_dict:
+                        return [[losses, losses_b], [outputs, outputs_b]] + [[encoding, encoding_b]]
+                    return AllOutput(
+                        losses=losses,
+                        outputs=outputs,
+                        hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
+                        attentions=encoding["attentions"] if "attentions" in encoding else None,
+                        global_attentions=None,
+                        contacts=encoding["contacts"] if "contacts" in encoding else None,
+                        losses_b=losses_b,
+                        outputs_b=outputs_b,
+                        hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
+                        attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
+                        global_attentions_b=None,
+                        contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
+                    )
+            elif has_pair:
+                if not return_dict:
+                    return [[losses], [outputs], [encoding]]
+                return AllOutput(
+                    losses=losses,
+                    outputs=outputs,
+                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
+                    attentions=encoding["attentions"] if "attentions" in encoding else None,
+                    global_attentions=None,
+                    contacts=encoding["contacts"] if "contacts" in encoding else None
+                )
+            else:
+                if not return_dict:
+                    return [[losses_b], [outputs_b], [encoding_b]]
+                return AllOutput(
+                    losses_b=losses_b,
+                    outputs_b=outputs_b,
+                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
+                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
+                    global_attentions_b=None,
+                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
+                )
+        else:
+            if has_pair and has_pair_b:
+                if not return_dict:
+                    return [[None, None], [None, None]] + [[encoding, encoding_b]]
+                return AllOutput(
+                    losses=None,
+                    outputs=None,
+                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
+                    attentions=encoding["attentions"] if "attentions" in encoding else None,
+                    global_attentions=None,
+                    contacts=encoding["contacts"] if "contacts" in encoding else None,
+                    losses_b=None,
+                    outputs_b=None,
+                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
+                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
+                    global_attentions_b=None,
+                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
+                )
+            elif has_pair:
+                if not return_dict:
+                    return [[None], [None], [encoding]]
+                return AllOutput(
+                    losses=None,
+                    outputs=None,
+                    hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
+                    attentions=encoding["attentions"] if "attentions" in encoding else None,
+                    global_attentions=None,
+                    contacts=encoding["contacts"] if "contacts" in encoding else None
+                )
+            else:
+                if not return_dict:
+                    return [[None], [None], [encoding_b]]
+                return AllOutput(
+                    losses_b=None,
+                    outputs_b=None,
+                    hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
+                    attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
+                    global_attentions_b=None,
+                    contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
+                )
 
     def predict_contacts(self, input_ids, position_ids=None, token_type_ids=None):
-        return self(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, return_contacts=True)["contacts"]
+        return self(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            return_contacts=True)["contacts"]
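The thread running through this file's changes is the new `self.embedding_inference` flag: when it is set, the LM head, contact head, and the pair-level classification/loss branches are never built or executed, and the forward pass returns only representations with `None` losses and outputs. A minimal sketch of that gating pattern on a toy encoder, assuming nothing about the real `LucaGPLM` beyond what the diff shows; class and layer names below are illustrative stand-ins, not the model's implementation:

```python
import torch
import torch.nn as nn

class EncoderForEmbedding(nn.Module):
    """Toy encoder that skips training-only heads when used for embedding inference."""

    def __init__(self, hidden_size=64, vocab_size=32, embedding_inference=True):
        super().__init__()
        self.embedding_inference = embedding_inference
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=4, batch_first=True),
            num_layers=2,
        )
        if not self.embedding_inference:
            # Only needed for masked-LM training; mirrors the `lm_head` gating above.
            self.lm_head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids):
        x = self.encoder(self.embed(input_ids))
        out = {"representation_matrix": x, "representation_vector": x[:, 0, :]}
        if not self.embedding_inference:
            out["lm_logits"] = self.lm_head(x)
        return out

model = EncoderForEmbedding()
print(model(torch.randint(0, 32, (2, 10)))["representation_vector"].shape)  # torch.Size([2, 64])
```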
modeling_bert.py
CHANGED
@@ -6,7 +6,7 @@
 @email: sanyuan.**@**.com
 @tel: 137****6540
 @datetime: 2022/12/2 09:38
-@project:
+@project: LucaOne
 @file: modeling_bert
 @desc: transformer layers
 '''
@@ -179,22 +179,20 @@ class BertEmbeddings(nn.Module):
 
     def __init__(self, config):
         super().__init__()
-        if hasattr(config, "no_token_embeddings"):
-            self.no_token_embeddings = config.no_token_embeddings
-        else:
-            self.no_token_embeddings = False
-        if not self.no_token_embeddings:
-            self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
         if hasattr(config, "no_position_embeddings"):
             self.no_position_embeddings = config.no_position_embeddings
         else:
             self.no_position_embeddings = False
+
         if hasattr(config, "no_token_type_embeddings"):
             self.no_token_type_embeddings = config.no_token_type_embeddings
         else:
             self.no_token_type_embeddings = False
+
         if not self.no_position_embeddings:
             self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
         if not self.no_token_type_embeddings:
             self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 
@@ -206,7 +204,10 @@ class BertEmbeddings(nn.Module):
         if not self.no_position_embeddings:
             self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
             self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-
+
+        if not self.no_token_type_embeddings:
+            if not hasattr(self, "position_ids"):
+                self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
         if version.parse(torch.__version__) > version.parse("1.6.0"):
             self.register_buffer(
                 "token_type_ids",
@@ -229,21 +230,20 @@ class BertEmbeddings(nn.Module):
 
         seq_length = input_shape[1]
 
-        if not self.no_position_embeddings and position_ids is None:
+        if (not self.no_position_embeddings or not self.no_token_type_embeddings) and position_ids is None:
             position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
 
         # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
         # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
         # issue #5664
         if not self.no_token_type_embeddings:
             if token_type_ids is None:
                 if hasattr(self, "token_type_ids"):
                     buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                     buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                     token_type_ids = buffered_token_type_ids_expanded
                 else:
                     token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-        if self.no_token_embeddings and inputs_embeds is None:
-            raise Exception("The model has not token_embeddings layer, the inputs_embeds cannot None")
 
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
@@ -898,14 +898,11 @@ class BertModel(BertPreTrainedModel):
     `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
     """
 
-    def __init__(self, config, use_pretrained_embedding=False, add_pooling_layer=True):
+    def __init__(self, config, add_pooling_layer=True):
         super().__init__(config)
         self.config = config
-        self.use_pretrained_embedding = use_pretrained_embedding
-        self.add_pooling_layer = add_pooling_layer
-
-        self.embeddings = nn.Linear(config.embedding_input_size, config.hidden_size) if use_pretrained_embedding else BertEmbeddings(config)
 
+        self.embeddings = BertEmbeddings(config)
         self.encoder = BertEncoder(config)
 
         self.pooler = BertPooler(config) if add_pooling_layer else None
@@ -1029,16 +1026,13 @@ class BertModel(BertPreTrainedModel):
         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
 
-            embedding_output = self.embeddings(
-                input_ids=input_ids,
-                position_ids=position_ids,
-                token_type_ids=token_type_ids,
-                inputs_embeds=inputs_embeds,
-                past_key_values_length=past_key_values_length,
-            )
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
         encoder_outputs = self.encoder(
             embedding_output,
             attention_mask=extended_attention_mask,
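After this change `BertEmbeddings` always owns a word-embedding table, while position and token-type embeddings remain optional behind `no_position_embeddings` / `no_token_type_embeddings` config attributes read with `hasattr`. A hedged sketch of building such a config: extra keyword arguments on a Hugging Face config are stored as plain attributes, so the flags below would be picked up by the patched module; the flag names come from the diff, the other values are arbitrary.

```python
from transformers import BertConfig

# Arbitrary small config; the two no_* flags are the ones the patched
# BertEmbeddings checks with hasattr(config, ...), defaulting to False.
config = BertConfig(
    vocab_size=39,
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    no_position_embeddings=True,
    no_token_type_embeddings=True,
)

print(getattr(config, "no_position_embeddings", False))    # True
print(getattr(config, "no_token_type_embeddings", False))  # True
```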
modeling_gplm.py
CHANGED
@@ -1,6 +1,15 @@
 #!/usr/bin/env python
 # encoding: utf-8
-
+'''
+@license: (C) Copyright 2021, Hey.
+@author: Hey
+@email: [email protected]
+@tel: 137****6540
+@datetime: 2023/7/24 10:01
+@project: LucaOne
+@file: modeling_gplm
+@desc: LucaOne Model Detail
+'''
 import math
 from typing import Dict, Optional, Sequence, Tuple, List, Union
 import uuid
@@ -11,19 +20,14 @@ from torch.nn import Parameter
 
 
 def gelu(x):
-    """Implementation of the gelu activation function.
-    OpenAI GPT's gelu: 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-    """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
 
 def symmetrize(x):
-    "Make layer symmetric in final two dimensions, used for contact prediction."
     return x + x.transpose(-1, -2)
 
 
 def apc(x):
-    "Perform average product correct, used for contact prediction."
     a1 = x.sum(-1, keepdims=True)
     a2 = x.sum(-2, keepdims=True)
     a12 = x.sum((-1, -2), keepdims=True)
@@ -57,7 +61,22 @@ class LucaGPLM1LayerNorm(nn.Module):
         x = (self.weight * x) + self.bias
         return x
 
-
+
+try:
+    # Optimized LayerNorm
+    from apex.normalization import FusedLayerNorm as _FusedLayerNorm
+    class LucaGPLM1bLayerNorm(_FusedLayerNorm):
+        @torch.jit.unused
+        def forward(self, x):
+            if not x.is_cuda:
+                return super().forward(x)
+            else:
+                with torch.cuda.device(x.device):
+                    return super().forward(x)
+
+except ImportError as e:
+    print("import apex err:", e)
+    from torch.nn import LayerNorm as LucaGPLM1bLayerNorm
 
 
 class LucaGPLMTransformerLayer(nn.Module):
@@ -141,7 +160,6 @@ class LucaGPLMTransformerLayer(nn.Module):
 
 
 class AxialTransformerLayer(nn.Module):
-    """Implements an Axial MSA Transformer block."""
     def __init__(
         self,
         embedding_dim: int = 768,
@@ -197,10 +215,6 @@ class AxialTransformerLayer(nn.Module):
         self_attn_padding_mask: Optional[torch.Tensor] = None,
         need_head_weights: bool = False,
     ):
-        """
-        LayerNorm is applied either before or after the self-attention/ffn
-        modules similar to the original Transformer implementation.
-        """
         x, row_attn = self.row_self_attention(
             x,
             self_attn_mask=self_attn_mask,
@@ -219,13 +233,6 @@ class AxialTransformerLayer(nn.Module):
 
 
 class LearnedPositionalEmbedding(nn.Embedding):
-    """
-    This module learns positional embeddings up to a fixed maximum size.
-    Padding ids are ignored by either offsetting based on padding_idx
-    or by setting padding_idx to None and ensuring that the appropriate
-    position ids are passed to the forward function.
-    """
-
     def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
         if padding_idx is not None:
             num_embeddings_ = num_embeddings + padding_idx + 1
@@ -293,8 +300,6 @@ class SinusoidalPositionalEmbedding(nn.Module):
 
 
 class RobertaLMHead(nn.Module):
-    """Head for masked language modeling."""
-
     def __init__(self, embed_dim, output_dim, weight):
         super().__init__()
         self.dense = nn.Linear(embed_dim, embed_dim)
@@ -312,8 +317,6 @@ class RobertaLMHead(nn.Module):
 
 
 class ContactPredictionHead(nn.Module):
-    """Performs symmetrization, apc, and computes a logistic regression on the output features"""
-
     def __init__(
         self,
         in_features: int,
@@ -697,11 +700,6 @@ def with_incremental_state(cls):
 
 @with_incremental_state
 class LucaGPLMMultiheadAttention(nn.Module):
-    """Multi-headed attention.
-
-    See "Attention Is All You Need" for more details.
-    """
-
     def __init__(
         self,
         embed_dim,
@@ -768,18 +766,6 @@ class LucaGPLMMultiheadAttention(nn.Module):
         self.onnx_trace = True
 
     def reset_parameters(self):
-        '''
-        if self.qkv_same_dim:
-            # Empirically observed the convergence to be much better with
-            # the scaled initialization
-            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
-            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
-            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
-        else:
-            nn.init.xavier_uniform_(self.k_proj.weight)
-            nn.init.xavier_uniform_(self.v_proj.weight)
-            nn.init.xavier_uniform_(self.q_proj.weight)
-        '''
         nn.init.xavier_uniform_(self.k_proj.weight, gain=nn.init.calculate_gain("relu"))
         nn.init.xavier_uniform_(self.v_proj.weight, gain=nn.init.calculate_gain("relu"))
         nn.init.xavier_uniform_(self.q_proj.weight, gain=nn.init.calculate_gain("relu"))
@@ -806,23 +792,6 @@ class LucaGPLMMultiheadAttention(nn.Module):
         before_softmax: bool = False,
         need_head_weights: bool = False,
     ) -> Tuple[Tensor, Optional[Tensor]]:
-        """Input shape: Time x Batch x Channel
-
-        Args:
-            key_padding_mask (ByteTensor, optional): mask to exclude
-                keys that are pads, of shape `(batch, src_len)`, where
-                padding elements are indicated by 1s.
-            need_weights (bool, optional): return the attention weights,
-                averaged over heads (default: False).
-            attn_mask (ByteTensor, optional): typically used to
-                implement causal attention, where the mask prevents the
-                attention from looking forward in time (default: None).
-            before_softmax (bool, optional): return the raw attention
-                weights and values before the attention softmax.
-            need_head_weights (bool, optional): return the attention
-                weights for each head. Implies *need_weights*. Default:
-                return the average attention weights over all heads.
-        """
         if need_head_weights:
             need_weights = True
 
@@ -1081,7 +1050,6 @@ class LucaGPLMMultiheadAttention(nn.Module):
     def reorder_incremental_state(
         self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor
    ):
-        """Reorder buffered internal state (for incremental generation)."""
         input_buffer = self._get_input_buffer(incremental_state)
         if input_buffer is not None:
             for k in input_buffer.keys():
@@ -1121,7 +1089,6 @@ class LucaGPLMMultiheadAttention(nn.Module):
         keys_to_remove = []
         for k in state_dict.keys():
             if k.endswith(prefix + "in_proj_weight"):
-                # in_proj_weight used to be q + k + v with same dimensions
                 dim = int(state_dict[k].shape[0] / 3)
                 items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
                 items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
@@ -1158,22 +1125,8 @@ def apply_rotary_pos_emb(x, cos, sin):
 
 
 class RotaryEmbedding(torch.nn.Module):
-    """
-    The rotary position embeddings from RoFormer_ (Su et. al).
-    A crucial insight from the method is that the query and keys are
-    transformed by rotation matrices which depend on the relative positions.
-    Other implementations are available in the Rotary Transformer repo_ and in
-    GPT-NeoX_, GPT-NeoX was an inspiration
-    .. _RoFormer: https://arxiv.org/abs/2104.09864
-    .. _repo: https://github.com/ZhuiyiTechnology/roformer
-    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
-    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
-        (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
-    """
-
     def __init__(self, dim: int, *_, **__):
         super().__init__()
-        # Generate and save the inverse frequency buffer (non trainable)
         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
         self.register_buffer("inv_freq", inv_freq)
 
@@ -1184,8 +1137,6 @@ class RotaryEmbedding(torch.nn.Module):
     def _update_cos_sin_tables(self, x, seq_dimension=1):
         seq_len = x.shape[seq_dimension]
 
-        # Reset the tables if the sequence length has changed,
-        # or if we're on a new device (possibly due to tracing for instance)
         if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
             self._seq_len_cached = seq_len
             t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
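Most of this file's diff strips docstrings and comments; the functional additions are the module header and the apex `FusedLayerNorm` fallback for `LucaGPLM1bLayerNorm`. The `symmetrize`/`apc` helpers that feed `ContactPredictionHead` are unchanged; below is a small self-contained sketch of that preprocessing on a dummy attention tensor, with the tail of `apc` (subtracting the average product) filled in the way ESM-style implementations usually do it, which is an assumption beyond what the diff shows:

```python
import torch

def symmetrize(x):
    # Make the last two dimensions symmetric, as in the diff.
    return x + x.transpose(-1, -2)

def apc(x):
    # Average product correction; the first three lines mirror the diff,
    # the rest is the usual ESM-style completion (assumption).
    a1 = x.sum(-1, keepdims=True)
    a2 = x.sum(-2, keepdims=True)
    a12 = x.sum((-1, -2), keepdims=True)
    avg = a1 * a2
    avg.div_(a12)
    return x - avg

# Dummy stack of attention maps with shape (layers * heads, seq_len, seq_len).
attentions = torch.rand(20, 64, 64)
corrected = apc(symmetrize(attentions))
print(corrected.shape)  # torch.Size([20, 64, 64])
```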