Yuanfei committed
Commit 96c0ca2 · verified · 1 Parent(s): 729354d

Upload LucaGPLM

alphabet.py ADDED
@@ -0,0 +1,164 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ import sys
+ import itertools
+ from typing import Sequence, List
+
+ from .batch_converter import BatchConverter
+
+ gene_standard_toks = ['1', '2', '3', '4', '5', '.', '-', '*']
+
+ prot_standard_toks = ['L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', 'J', '.', '-', '*']
+
+ gene_prot_standard_toks = ['1', '2', '3', '4', '5', 'L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', 'J', '.', '-', '*']
+
+ gene_prot_prepend_toks = ['[PAD]', '[UNK]']
+
+ gene_prot_append_toks = ['[CLS]', '[SEP]', '[MASK]']
+
+
+ class Alphabet(object):
+     def __init__(
+             self,
+             standard_toks: Sequence[str],
+             prepend_toks: Sequence[str] = gene_prot_prepend_toks,
+             append_toks: Sequence[str] = gene_prot_append_toks,
+             prepend_bos: bool = True,
+             append_eos: bool = True
+     ):
+         self.standard_toks = list(standard_toks)
+         self.prepend_toks = list(prepend_toks)
+         self.append_toks = list(append_toks)
+         self.prepend_bos = prepend_bos
+         self.append_eos = append_eos
+
+         # vocabulary order: prepend tokens, then append tokens, then standard tokens
+         self.all_toks = list(self.prepend_toks)
+         self.all_toks.extend(self.append_toks)
+         self.all_toks.extend(self.standard_toks)
+
+         self.tok_to_idx = {tok: i for i, tok in enumerate(self.all_toks)}
+
+         self.unk_idx = self.tok_to_idx["[UNK]"]
+         self.padding_idx = self.get_idx("[PAD]")
+         self.pad_token_id = self.padding_idx
+         self.cls_idx = self.get_idx("[CLS]")
+         self.mask_idx = self.get_idx("[MASK]")
+         self.eos_idx = self.get_idx("[SEP]")
+         self.all_special_tokens = prepend_toks + append_toks
+         self.all_special_token_idx_list = [self.tok_to_idx[v] for v in self.all_special_tokens]
+         self.unique_no_split_tokens = self.all_toks
+         self.vocab_size = self.__len__()
+
+     def __len__(self):
+         return len(self.all_toks)
+
+     def get_idx(self, tok):
+         return self.tok_to_idx.get(tok, self.unk_idx)
+
+     def get_tok(self, ind):
+         return self.all_toks[ind]
+
+     def to_dict(self):
+         return self.tok_to_idx.copy()
+
+     def get_batch_converter(self, no_position_embeddings, no_token_type_embeddings, truncation_seq_length: int = None, ignore_index: int = -100, mlm_probability=0.15):
+         return BatchConverter(self,
+                               no_position_embeddings=no_position_embeddings,
+                               no_token_type_embeddings=no_token_type_embeddings,
+                               truncation_seq_length=truncation_seq_length,
+                               ignore_index=ignore_index,
+                               mlm_probability=mlm_probability)
+
+     @classmethod
+     def from_predefined(cls, name: str):
+         if name.lower() == "prot":
+             standard_toks = prot_standard_toks
+         elif name.lower() == "gene":
+             standard_toks = gene_standard_toks
+         elif name.lower() in ["gene_prot", "prot_gene"]:
+             standard_toks = gene_prot_standard_toks
+         else:
+             raise Exception("Unsupported tokenizer name: %s" % name)
+
+         prepend_toks = gene_prot_prepend_toks
+         append_toks = gene_prot_append_toks
+         prepend_bos = True
+         append_eos = True
+
+         return cls(standard_toks, prepend_toks, append_toks, prepend_bos, append_eos)
+
+     @classmethod
+     def from_pretrained(cls, dir_path):
+         import os, pickle
+         return pickle.load(open(os.path.join(dir_path, "alphabet.pkl"), "rb"))
+
+     def save_pretrained(self, save_dir):
+         import os, pickle
+         with open(os.path.join(save_dir, "alphabet.pkl"), 'wb') as outp:
+             pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)
+
+     def _tokenize(self, text) -> List[str]:
+         return text.split()
+
+     def tokenize(self, text, **kwargs) -> List[str]:
+         def split_on_token(tok, text):
+             result = []
+             split_text = text.split(tok)
+             for i, sub_text in enumerate(split_text):
+                 if i < len(split_text) - 1:
+                     sub_text = sub_text.rstrip()
+                 if i > 0:
+                     sub_text = sub_text.lstrip()
+
+                 if i == 0 and not sub_text:
+                     result.append(tok)
+                 elif i == len(split_text) - 1:
+                     if sub_text:
+                         result.append(sub_text)
+                     else:
+                         pass
+                 else:
+                     if sub_text:
+                         result.append(sub_text)
+                     result.append(tok)
+             return result
+
+         def split_on_tokens(tok_list, text):
+             if not text.strip():
+                 return []
+             tokenized_text = []
+             text_list = [text]
+             for tok in tok_list:
+                 tokenized_text = []
+                 for sub_text in text_list:
+                     if sub_text not in self.unique_no_split_tokens:
+                         tokenized_text.extend(split_on_token(tok, sub_text))
+                     else:
+                         tokenized_text.append(sub_text)
+                 text_list = tokenized_text
+
+             return list(
+                 itertools.chain.from_iterable(
+                     (
+                         self._tokenize(token)
+                         if token not in self.unique_no_split_tokens
+                         else [token]
+                         for token in tokenized_text
+                     )
+                 )
+             )
+
+         no_split_token = self.unique_no_split_tokens
+         tokenized_text = split_on_tokens(no_split_token, text)
+         return tokenized_text
+
+     def encode(self, text):
+         return [self.tok_to_idx[tok] for tok in self.tokenize(text)]
+
+
+ if __name__ == "__main__":
+     alphabet = Alphabet.from_predefined("gene_prot")
+     from src.utils import gene_seq_replace
+     print(alphabet.encode(gene_seq_replace("gttgtttggtagctaggagcctgactacatggcttcaaggctaaatggccacaggtgcccaggctatttggcttgctggaggcttcattcat")))
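As a quick sanity check of the Alphabet class above, the following minimal sketch (an illustration, not part of the commit; it assumes the class is importable from this module) exercises the vocabulary layout and the character-level tokenizer. Because every vocabulary entry is in unique_no_split_tokens, tokenize() ends up splitting an input string token by token:

    alphabet = Alphabet.from_predefined("prot")
    print(alphabet.vocab_size)    # 2 prepend + 3 append + 29 standard tokens = 34
    print(alphabet.padding_idx)   # 0, because '[PAD]' heads prepend_toks
    ids = alphabet.encode("MKT")  # splits per character, since each residue is a no-split token
    print([alphabet.get_tok(i) for i in ids])  # ['M', 'K', 'T']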
alphabet_atom.py ADDED
@@ -0,0 +1,132 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ from rdkit import Chem
+ from rdkit.Chem import AllChem
+ from typing import Sequence, List
+
+ atom_standard_toks = ['C', 'N', 'O', 'S', 'H', 'Cl', 'F', 'Br', 'I',
+                       'Si', 'P', 'B', 'Na', 'K', 'Al', 'Ca', 'Sn', 'As',
+                       'Hg', 'Fe', 'Zn', 'Cr', 'Se', 'Gd', 'Au', 'Li'
+                       ]
+
+ atom_prepend_toks = ['[PAD]', '[UNK]', '[CLS]']
+
+ atom_append_toks = ['[SEP]', '[MASK]']
+
+
+ class AlphabetAtom(object):
+     def __init__(
+             self,
+             standard_toks: Sequence[str] = atom_standard_toks,
+             prepend_toks: Sequence[str] = atom_prepend_toks,
+             append_toks: Sequence[str] = atom_append_toks,
+             prepend_bos: bool = True,
+             append_eos: bool = True
+     ):
+         self.standard_toks = list(standard_toks)
+         self.prepend_toks = list(prepend_toks)
+         self.append_toks = list(append_toks)
+         self.prepend_bos = prepend_bos
+         self.append_eos = append_eos
+
+         self.all_toks = list(self.prepend_toks)
+         self.all_toks.extend(self.append_toks)
+         self.all_toks.extend(self.standard_toks)
+
+         self.tok_to_idx = {tok: i for i, tok in enumerate(self.all_toks)}
+
+         self.unk_idx = self.tok_to_idx["[UNK]"]
+         self.padding_idx = self.get_idx("[PAD]")
+         self.pad_idx = self.get_idx("[PAD]")
+         self.pad_token_id = self.padding_idx
+         self.cls_idx = self.get_idx("[CLS]")
+         self.mask_idx = self.get_idx("[MASK]")
+         self.eos_idx = self.get_idx("[SEP]")
+         self.all_special_tokens = prepend_toks + append_toks
+         self.all_special_token_idx_list = [self.tok_to_idx[v] for v in self.all_special_tokens]
+         self.unique_no_split_tokens = self.all_toks
+         self.vocab_size = self.__len__()
+
+     def __len__(self):
+         return len(self.all_toks)
+
+     def get_idx(self, tok):
+         return self.tok_to_idx.get(tok, self.unk_idx)
+
+     def get_tok(self, ind):
+         return self.all_toks[ind]
+
+     def to_dict(self):
+         return self.tok_to_idx.copy()
+
+     def get_batch_converter(self, task_level_type, label_size, output_mode, no_position_embeddings,
+                             no_token_type_embeddings, truncation_seq_length: int = None, ignore_index: int = -100, mlm_probability=0.15):
+         # not implemented yet; the intended call is kept below for reference
+         '''
+         return BatchConverter(
+             task_level_type,
+             label_size,
+             output_mode,
+             seq_subword=False,
+             seq_tokenizer=self,
+             no_position_embeddings=no_position_embeddings,
+             no_token_type_embeddings=no_token_type_embeddings,
+             truncation_seq_length=truncation_seq_length,
+             truncation_matrix_length=truncation_seq_length,
+             ignore_index=ignore_index,
+             mlm_probability=mlm_probability,
+             prepend_bos=self.prepend_bos,
+             append_eos=self.append_eos)
+         '''
+         pass
+
+     @classmethod
+     def smiles_2_atom_seq(cls, smi):
+         mol = Chem.MolFromSmiles(smi)
+         mol = AllChem.AddHs(mol)
+         atoms = [atom.GetSymbol() for atom in mol.GetAtoms()]  # after adding explicit hydrogens
+         return atoms
+
+     @classmethod
+     def from_predefined(cls, name: str = "atom_v1"):
+         if name.lower() == "atom_v1":
+             standard_toks = atom_standard_toks
+         else:
+             raise Exception("Unsupported tokenizer name: %s" % name)
+
+         prepend_toks = atom_prepend_toks
+         append_toks = atom_append_toks
+         prepend_bos = True
+         append_eos = True
+
+         return cls(standard_toks, prepend_toks, append_toks, prepend_bos, append_eos)
+
+     @classmethod
+     def from_pretrained(cls, dir_path):
+         import os, pickle
+         return pickle.load(open(os.path.join(dir_path, "alphabet_atom.pkl"), "rb"))
+
+     def save_pretrained(self, save_dir):
+         import os, pickle
+         with open(os.path.join(save_dir, "alphabet_atom.pkl"), 'wb') as outp:
+             pickle.dump(self, outp, pickle.HIGHEST_PROTOCOL)
+
+     def tokenize(self, smi, prepend_bos, append_eos) -> List[str]:
+         seq = AlphabetAtom.smiles_2_atom_seq(smi)
+         if prepend_bos:
+             seq = [self.get_tok(self.cls_idx)] + seq
+         if append_eos:
+             seq = seq + [self.get_tok(self.eos_idx)]
+         return seq
+
+     def encode(self, atom_list, prepend_bos, append_eos):
+         idx_list = [self.get_idx(tok) for tok in atom_list]
+         if prepend_bos:
+             idx_list = [self.cls_idx] + idx_list
+         if append_eos:
+             idx_list = idx_list + [self.eos_idx]
+         return idx_list
+
+     def encode_smi(self, smi, prepend_bos, append_eos):
+         atom_list = self.smiles_2_atom_seq(smi)
+         return self.encode(atom_list, prepend_bos, append_eos)
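A similar minimal sketch for AlphabetAtom (again an illustration, not part of the commit; it requires RDKit and assumes the class is importable). smiles_2_atom_seq adds explicit hydrogens before listing atom symbols, so ethanol expands from 3 heavy atoms to 9 tokens:

    tokenizer = AlphabetAtom.from_predefined("atom_v1")
    atoms = AlphabetAtom.smiles_2_atom_seq("CCO")  # ethanol
    print(atoms)  # ['C', 'C', 'O', 'H', 'H', 'H', 'H', 'H', 'H'] (hydrogens appended after heavy atoms)
    ids = tokenizer.encode_smi("CCO", prepend_bos=True, append_eos=True)
    print(ids[0] == tokenizer.cls_idx, ids[-1] == tokenizer.eos_idx)  # True True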
batch_converter.py ADDED
@@ -0,0 +1,1365 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ import sys
+ import torch
+ from typing import Sequence
+
+ from .alphabet_atom import AlphabetAtom
+ from .utils import gene_seq_replace
+
+ class BatchConverter(object):
+
+     def __init__(self,
+                  task_level_type,
+                  label_size,
+                  output_mode,
+                  seq_subword,
+                  seq_tokenizer,
+                  no_position_embeddings,
+                  no_token_type_embeddings,
+                  truncation_seq_length: int = None,
+                  truncation_matrix_length: int = None,
+                  atom_tokenizer: AlphabetAtom = None,
+                  atom_truncation_seq_length: int = None,
+                  atom_truncation_matrix_length: int = None,
+                  ignore_index: int = -100,
+                  padding_idx: int = 0,
+                  unk_idx: int = 1,
+                  cls_idx: int = 2,
+                  eos_idx: int = 3,
+                  mask_idx: int = 4,
+                  non_ignore: bool = False,
+                  mlm_probability=0.15,
+                  prepend_bos=None,
+                  append_eos=None,
+                  **kwargs):
+         print("------BatchConverter------")
+         print("BatchConverter, kwargs:")
+         print(kwargs)
+         self.task_level_type = task_level_type
+         self.label_size = label_size
+         self.output_mode = output_mode
+         self.seq_tokenizer = seq_tokenizer
+         self.seq_subword = seq_subword
+         self.ignore_index = ignore_index
+         self.non_ignore = non_ignore
+         self.mlm_probability = mlm_probability
+         self.truncation_seq_length = truncation_seq_length
+         self.truncation_matrix_length = truncation_matrix_length
+
+         # a subword tokenizer always includes the two special tokens
+         if prepend_bos is None:
+             if seq_subword is not None:
+                 self.prepend_bos = True
+             else:
+                 self.prepend_bos = False
+         else:
+             self.prepend_bos = prepend_bos
+         if append_eos is None:
+             if seq_subword is not None:
+                 self.append_eos = True
+             else:
+                 self.append_eos = False
+         else:
+             self.append_eos = append_eos
+
+         self.padding_idx = padding_idx
+         self.unk_idx = unk_idx
+         self.cls_idx = cls_idx
+         self.eos_idx = eos_idx
+         self.mask_idx = mask_idx
+         if self.seq_tokenizer is None:
+             self.append_len = 0
+         else:
+             if hasattr(seq_tokenizer, "prepend_bos"):
+                 self.prepend_bos = self.seq_tokenizer.prepend_bos
+             if hasattr(seq_tokenizer, "append_eos"):
+                 self.append_eos = self.seq_tokenizer.append_eos
+             if hasattr(seq_tokenizer, "padding_idx"):
+                 self.padding_idx = self.seq_tokenizer.padding_idx
+             if hasattr(seq_tokenizer, "unk_idx"):
+                 self.unk_idx = self.seq_tokenizer.unk_idx
+             if hasattr(seq_tokenizer, "cls_idx"):
+                 self.cls_idx = self.seq_tokenizer.cls_idx
+             if hasattr(seq_tokenizer, "eos_idx"):
+                 self.eos_idx = self.seq_tokenizer.eos_idx
+             if hasattr(seq_tokenizer, "mask_idx"):
+                 self.mask_idx = self.seq_tokenizer.mask_idx
+             if hasattr(seq_tokenizer, "all_special_token_idx_list"):
+                 self.all_special_token_idx_list = self.seq_tokenizer.all_special_token_idx_list
+             else:
+                 self.all_special_token_idx_list = [self.padding_idx, self.unk_idx, self.cls_idx, self.eos_idx, self.mask_idx]
+             self.append_len = int(self.prepend_bos) + int(self.append_eos)
+
+         # for atom
+         self.atom_tokenizer = atom_tokenizer
+         self.atom_truncation_seq_length = atom_truncation_seq_length
+         self.atom_truncation_matrix_length = atom_truncation_matrix_length
+         self.atom_prepend_bos = False
+         self.atom_append_eos = False
+         self.atom_padding_idx = padding_idx
+         self.atom_unk_idx = unk_idx
+         self.atom_cls_idx = cls_idx
+         self.atom_eos_idx = eos_idx
+         self.atom_mask_idx = mask_idx
+         if self.atom_tokenizer is None:
+             self.atom_append_len = 0
+         else:
+             if hasattr(atom_tokenizer, "padding_idx"):
+                 self.atom_padding_idx = self.atom_tokenizer.padding_idx
+             elif hasattr(atom_tokenizer, "pad_idx"):
+                 self.atom_padding_idx = self.atom_tokenizer.pad_idx
+             elif hasattr(atom_tokenizer, "pad_token_id"):
+                 self.atom_padding_idx = self.atom_tokenizer.pad_token_id
+
+             if hasattr(atom_tokenizer, "unk_idx"):
+                 self.atom_unk_idx = self.atom_tokenizer.unk_idx
+             elif hasattr(atom_tokenizer, "unk_token_id"):
+                 self.atom_unk_idx = self.atom_tokenizer.unk_token_id
+
+             if hasattr(atom_tokenizer, "cls_idx"):
+                 self.atom_cls_idx = self.atom_tokenizer.cls_idx
+             elif hasattr(atom_tokenizer, "cls_token_id"):
+                 self.atom_cls_idx = self.atom_tokenizer.cls_token_id
+             elif hasattr(atom_tokenizer, "bos_idx"):
+                 self.atom_cls_idx = self.atom_tokenizer.bos_idx
+             elif hasattr(atom_tokenizer, "bos_token_id"):
+                 self.atom_cls_idx = self.atom_tokenizer.bos_token_id
+
+             if hasattr(atom_tokenizer, "eos_idx"):
+                 self.atom_eos_idx = self.atom_tokenizer.eos_idx
+             elif hasattr(atom_tokenizer, "eos_token_id"):
+                 self.atom_eos_idx = self.atom_tokenizer.eos_token_id
+             elif hasattr(atom_tokenizer, "sep_token_id"):
+                 self.atom_eos_idx = self.atom_tokenizer.sep_token_id
+
+             if hasattr(atom_tokenizer, "mask_idx"):
+                 self.atom_mask_idx = self.atom_tokenizer.mask_idx
+             elif hasattr(atom_tokenizer, "mask_token_id"):
+                 self.atom_mask_idx = self.atom_tokenizer.mask_token_id
+             if hasattr(atom_tokenizer, "all_special_token_idx_list"):
+                 self.atom_all_special_token_idx_list = self.atom_tokenizer.all_special_token_idx_list
+             else:
+                 self.atom_all_special_token_idx_list = [self.atom_padding_idx, self.atom_unk_idx, self.atom_cls_idx, self.atom_eos_idx, self.atom_mask_idx]
+             self.atom_append_len = int(self.atom_prepend_bos) + int(self.atom_append_eos)
+
+         print("BatchConverter: prepend_bos=%r, append_eos=%r" % (self.prepend_bos, self.append_eos))
+         print("BatchConverter: atom_prepend_bos=%r, atom_append_eos=%r" % (self.atom_prepend_bos, self.atom_append_eos))
+         self.matrix_add_special_token = False
+         if "matrix_add_special_token" in kwargs and kwargs["matrix_add_special_token"]:
+             self.matrix_add_special_token = kwargs["matrix_add_special_token"]
+         if self.matrix_add_special_token:
+             self.prepend_bos = True
+             self.append_eos = True
+             self.atom_prepend_bos = True
+             self.atom_append_eos = True
+             self.append_len = int(self.prepend_bos) + int(self.append_eos)
+             self.atom_append_len = int(self.atom_prepend_bos) + int(self.atom_append_eos)
+
+         # lengths after subtracting the special tokens
+         if self.truncation_seq_length:
+             self.truncation_seq_length -= self.append_len
+         if self.truncation_matrix_length:
+             self.truncation_matrix_length -= self.append_len
+         # lengths after subtracting the special tokens
+         if self.atom_truncation_seq_length:
+             self.atom_truncation_seq_length -= self.atom_append_len
+         if self.atom_truncation_matrix_length:
+             self.atom_truncation_matrix_length -= self.atom_append_len
+
+         self.input_type = None
+         if "input_type" in kwargs and kwargs["input_type"]:
+             self.input_type = kwargs["input_type"]
+
+         if "max_sentence_length" in kwargs and kwargs["max_sentence_length"]:
+             self.max_sentence_length = kwargs["max_sentence_length"] - self.append_len
+             print("BatchConverter: self.max_sentence_length=%d" % self.max_sentence_length)
+             if atom_tokenizer is not None:
+                 self.atom_max_sentence_length = kwargs["max_sentence_length"] - self.atom_append_len
+                 print("BatchConverter: self.atom_max_sentence_length=%d" % self.atom_max_sentence_length)
+         if "max_sentences" in kwargs and kwargs["max_sentences"]:
+             self.max_sentences = kwargs["max_sentences"]
+             print("BatchConverter: self.max_sentences=%d" % self.max_sentences)
+         self.trunc_type = "right"
+         if "trunc_type" in kwargs and kwargs["trunc_type"]:
+             self.trunc_type = kwargs["trunc_type"]
+         print("BatchConverter: self.trunc_type=%s" % self.trunc_type)
+
+         self.no_position_embeddings = no_position_embeddings
+         self.no_token_type_embeddings = no_token_type_embeddings
+         print("BatchConverter: prepend_bos=%r, append_eos=%r" % (self.prepend_bos, self.append_eos))
+         print("BatchConverter: atom_prepend_bos=%r, atom_append_eos=%r" % (self.atom_prepend_bos, self.atom_append_eos))
+         print("-" * 50)
+
+     def __parse_label__(self, max_length, task_level_type, label_size, output_mode, label):
+         if isinstance(label, str):
+             label = eval(label)
+         '''
+         print("label:")
+         print(label)
+         '''
+         # this must be the padded length
+         cur_len = max_length
+         if task_level_type in ["token_level", "structure_level"]:
+             if output_mode in ["multi_label", "multi-label"]:
+                 # N * seq_len * label_size
+                 new_label = []
+                 for _ in range(cur_len):
+                     tmp = []
+                     for _ in range(label_size):
+                         tmp.append(0 if self.non_ignore else self.ignore_index)
+                     new_label.append(tmp)
+             else:
+                 # N * seq_len
+                 new_label = []
+                 for _ in range(cur_len):
+                     new_label.append(0 if self.non_ignore else self.ignore_index)
+             if label is not None and len(label) > 0:
+                 begin_idx = 0
+                 end_idx = cur_len
+                 if self.prepend_bos:
+                     begin_idx = 1
+                 if self.append_eos:
+                     end_idx = cur_len - 1
+                 for idx, item in enumerate(label):
+                     idx += begin_idx
+                     if idx >= end_idx:
+                         break
+                     if output_mode in ["multi_label", "multi-label"]:
+                         for v in item:
+                             new_label[idx][v] = 1
+                     else:
+                         new_label[idx] = item
+         elif task_level_type == "span_level":
+             if output_mode in ["multi_label", "multi-label"]:
+                 # N * seq_len * label_size
+                 new_label = []
+                 for _ in range(cur_len):
+                     tmp = []
+                     for _ in range(label_size):
+                         tmp.append(0 if self.non_ignore else self.ignore_index)
+                     new_label.append(tmp)
+             else:
+                 # N * seq_len
+                 new_label = []
+                 for _ in range(cur_len):
+                     new_label.append(0 if self.non_ignore else self.ignore_index)
+             if label is not None and len(label) > 0:
+                 begin_idx = 0
+                 end_idx = cur_len
+                 if self.prepend_bos:
+                     begin_idx = 1
+                 if self.append_eos:
+                     end_idx = cur_len - 1
+                 for item in label:
+                     for idx in range(item[0], item[1] + 1, 1):
+                         idx += begin_idx
+                         if idx >= end_idx:
+                             break
+                         if output_mode in ["multi_label", "multi-label"]:
+                             new_label[idx][item[2]] = 1
+                         else:
+                             new_label[idx] = item[2]
+         elif task_level_type in ["seq_level"]:
+             if output_mode in ["multi_label", "multi-label"]:
+                 # N * label_size
+                 new_label = []
+                 for _ in range(label_size):
+                     new_label.append(0 if self.non_ignore else self.ignore_index)
+             else:
+                 # N * 1
+                 new_label = [0 if self.non_ignore else self.ignore_index]
+             if output_mode in ["multi_label", "multi-label"]:
+                 if label is not None and len(label) > 0:
+                     for v in label:
+                         new_label[int(v)] = 1
+             else:
+                 if label is not None and len(str(label)) > 0:
+                     if isinstance(label, str):
+                         new_label = [int(label)]
+                     elif isinstance(label, list):
+                         new_label = [int(label[0])]
+                     else:
+                         new_label = [label]
+         else:
+             raise Exception("Unsupported task_level_type=%s" % task_level_type)
+         return new_label
+
+     def __atom_parse_label__(self, max_length, task_level_type, label_size, output_mode, label):
+         if isinstance(label, str):
+             label = eval(label)
+         '''
+         print("label:")
+         print(label)
+         '''
+         # this must be the padded length
+         cur_len = max_length
+         if task_level_type in ["token_level", "structure_level"]:
+             if output_mode in ["multi_label", "multi-label"]:
+                 # N * seq_len * label_size
+                 new_label = []
+                 for _ in range(cur_len):
+                     tmp = []
+                     for _ in range(label_size):
+                         tmp.append(0 if self.non_ignore else self.ignore_index)
+                     new_label.append(tmp)
+             else:
+                 # N * seq_len
+                 new_label = []
+                 for _ in range(cur_len):
+                     new_label.append(0 if self.non_ignore else self.ignore_index)
+             if label is not None and len(label) > 0:
+                 begin_idx = 0
+                 end_idx = cur_len
+                 if self.atom_prepend_bos:
+                     begin_idx = 1
+                 if self.atom_append_eos:
+                     end_idx = cur_len - 1
+                 for idx, item in enumerate(label):
+                     idx += begin_idx
+                     if idx >= end_idx:
+                         break
+                     if output_mode in ["multi_label", "multi-label"]:
+                         for v in item:
+                             new_label[idx][v] = 1
+                     else:
+                         new_label[idx] = item
+         elif task_level_type == "span_level":
+             if output_mode in ["multi_label", "multi-label"]:
+                 # N * seq_len * label_size
+                 new_label = []
+                 for _ in range(cur_len):
+                     tmp = []
+                     for _ in range(label_size):
+                         tmp.append(0 if self.non_ignore else self.ignore_index)
+                     new_label.append(tmp)
+             else:
+                 # N * seq_len
+                 new_label = []
+                 for _ in range(cur_len):
+                     new_label.append(0 if self.non_ignore else self.ignore_index)
+             if label is not None and len(label) > 0:
+                 begin_idx = 0
+                 end_idx = cur_len
+                 if self.atom_prepend_bos:
+                     begin_idx = 1
+                 if self.atom_append_eos:
+                     end_idx = cur_len - 1
+                 for item in label:
+                     for idx in range(item[0], item[1] + 1, 1):
+                         idx += begin_idx
+                         if idx >= end_idx:
+                             break
+                         if output_mode in ["multi_label", "multi-label"]:
+                             new_label[idx][item[2]] = 1
+                         else:
+                             new_label[idx] = item[2]
+         elif task_level_type in ["seq_level"]:
+             if output_mode in ["multi_label", "multi-label"]:
+                 # N * label_size
+                 new_label = []
+                 for _ in range(label_size):
+                     new_label.append(0 if self.non_ignore else self.ignore_index)
+             else:
+                 # N * 1
+                 new_label = [0 if self.non_ignore else self.ignore_index]
+             if output_mode in ["multi_label", "multi-label"]:
+                 if label is not None and len(label) > 0:
+                     for v in label:
+                         new_label[int(v)] = 1
+             else:
+                 if label is not None and len(str(label)) > 0:
+                     if isinstance(label, str):
+                         new_label = [int(label)]
+                     elif isinstance(label, list):
+                         new_label = [int(label[0])]
+                     else:
+                         new_label = [label]
+         else:
+             raise Exception("Unsupported task_level_type=%s" % task_level_type)
+
+         return new_label
+
+     def __mask_tokens__(self, input_ids):
+         labels = input_ids.clone()
+         probability_matrix = torch.full(labels.shape, self.mlm_probability)
+
+         # 1 at special-token positions
+         special_tokens_mask = [
+             1 if v in self.all_special_token_idx_list else 0 for v in labels.tolist()
+         ]
+         special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
+         # zero out the masking probability at special-token positions
+         probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
+
+         # sample masked positions among the non-special tokens
+         masked_indices = torch.bernoulli(probability_matrix).bool()
+         # non-masked positions are set to the ignore index (-100)
+         labels[~masked_indices] = self.ignore_index  # We only compute loss on masked tokens
+
+         # 80% of the time, we replace masked input tokens with alphabet.mask_token ([MASK])
+         indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+         input_ids[indices_replaced] = self.mask_idx
+
+         # 10% of the time, we replace masked input tokens with a random word
+         indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+         random_words = torch.randint(len(self.seq_tokenizer), labels.shape, dtype=torch.long)
+         input_ids[indices_random] = random_words[indices_random]
+
+         # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+         return input_ids, labels
+
+     def __atom_mask_tokens__(self, input_ids):
+         labels = input_ids.clone()
+         probability_matrix = torch.full(labels.shape, self.mlm_probability)
+
+         # 1 at special-token positions
+         special_tokens_mask = [
+             1 if v in self.atom_all_special_token_idx_list else 0 for v in labels.tolist()
+         ]
+         special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
+         # zero out the masking probability at special-token positions
+         probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
+
+         # sample masked positions among the non-special tokens
+         masked_indices = torch.bernoulli(probability_matrix).bool()
+         # non-masked positions are set to the ignore index (-100)
+         labels[~masked_indices] = self.ignore_index  # We only compute loss on masked tokens
+
+         # 80% of the time, we replace masked input tokens with alphabet.mask_token ([MASK])
+         indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+         input_ids[indices_replaced] = self.atom_mask_idx
+
+         # 10% of the time, we replace masked input tokens with a random word
+         indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+         random_words = torch.randint(len(self.atom_tokenizer), labels.shape, dtype=torch.long)
+         input_ids[indices_random] = random_words[indices_random]
+
+         # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+         return input_ids, labels
+
+     def __seq_encode__(self, batch_size, seqs):
+         '''
+         This function does not add the special tokens [CLS] and [SEP].
+         :param batch_size:
+         :param seqs:
+         :return:
+         '''
+         if self.seq_subword:
+             seq_encoded_list = []
+             for seq_str in seqs:
+                 seq_to_list = self.seq_subword.process_line(seq_str.upper()).split(" ")
+                 seq = " ".join(seq_to_list)
+                 inputs = self.seq_tokenizer.encode_plus(
+                     seq,
+                     None,
+                     add_special_tokens=False,
+                     max_length=self.truncation_seq_length,
+                     truncation=True
+                 )
+                 seq_encoded_list.append(inputs["input_ids"])
+         else:
+             seq_encoded_list = [self.seq_tokenizer.encode(seq_str.upper()) for seq_str in seqs]
+         # this length already excludes the special tokens to be added
+         if self.truncation_seq_length:
+             seq_encoded_list = [encoded[:self.truncation_seq_length] for encoded in seq_encoded_list]
+         max_len = max(len(seq_encoded) for seq_encoded in seq_encoded_list)
+         max_len = max_len + int(self.prepend_bos) + int(self.append_eos)
+         # for input
+         input_ids = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+             ),
+             dtype=torch.int64,
+         )
+         input_ids.fill_(self.padding_idx)
+
+         position_ids = None
+         if not self.no_position_embeddings:
+             position_ids = torch.empty(
+                 (
+                     batch_size,
+                     max_len,
+                 ),
+                 dtype=torch.int64,
+             )
+             position_ids.fill_(self.padding_idx)
+
+         token_type_ids = None
+         if not self.no_token_type_embeddings:
+             token_type_ids = torch.empty(
+                 (
+                     batch_size,
+                     max_len,
+                 ),
+                 dtype=torch.int64,
+             )
+             token_type_ids.fill_(self.padding_idx)
+         attention_masks = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+             ),
+             dtype=torch.int64,
+         )
+         attention_masks.fill_(0)
+
+         return seq_encoded_list, input_ids, position_ids, token_type_ids, attention_masks, max_len
+
+     def __multi_seq_encode__(self, batch_size, seqs):
+         '''
+         This function encodes multiple sentences per sample; [CLS] and [SEP] are added to every sentence.
+         :param batch_size:
+         :param seqs:
+         :return:
+         '''
+         assert hasattr(self, "max_sentences") and hasattr(self, "max_sentence_length")
+         max_sentence_len = 0
+         max_sentence_num = 0
+         if self.seq_subword:
+             seq_encoded_list = []
+             for cur_sample_seqs in seqs:
+                 cur_seq_encoded_list = []
+                 if len(cur_sample_seqs) > self.max_sentences:
+                     # at most max_sentences sequences per sample
+                     if self.trunc_type == "left":
+                         cur_sample_seqs = cur_sample_seqs[-self.max_sentences:]
+                     else:
+                         cur_sample_seqs = cur_sample_seqs[:self.max_sentences]
+                 if max_sentence_num < len(cur_sample_seqs):
+                     max_sentence_num = len(cur_sample_seqs)
+                 for seq_idx, seq_str in enumerate(cur_sample_seqs):
+                     seq_to_list = self.seq_subword.process_line(seq_str.upper()).split(" ")
+                     seq = " ".join(seq_to_list)
+                     inputs = self.seq_tokenizer.encode_plus(
+                         seq,
+                         None,
+                         add_special_tokens=False,
+                         max_length=self.max_sentence_length,
+                         truncation=True
+                     )
+                     if self.prepend_bos:
+                         inputs["input_ids"] = [self.cls_idx] + inputs["input_ids"]
+                     if self.append_eos:
+                         inputs["input_ids"] = inputs["input_ids"] + [self.eos_idx]
+                     if max_sentence_len < len(inputs["input_ids"]):
+                         max_sentence_len = len(inputs["input_ids"])
+                     cur_seq_encoded_list.append(inputs["input_ids"])
+                 seq_encoded_list.append(cur_seq_encoded_list)
+         else:
+             seq_encoded_list = []
+             for cur_sample_seqs in seqs:
+                 cur_seq_encoded_list = []
+                 if len(cur_sample_seqs) > self.max_sentences:
+                     # at most max_sentences sequences per sample
+                     if self.trunc_type == "left":
+                         cur_sample_seqs = cur_sample_seqs[-self.max_sentences:]
+                     else:
+                         cur_sample_seqs = cur_sample_seqs[:self.max_sentences]
+                 if max_sentence_num < len(cur_sample_seqs):
+                     max_sentence_num = len(cur_sample_seqs)
+                 for seq_idx, seq_str in enumerate(cur_sample_seqs):
+                     if len(seq_str) > self.max_sentence_length:
+                         if self.trunc_type == "left":
+                             seq_str = seq_str[-self.max_sentence_length:]
+                         else:
+                             seq_str = seq_str[:self.max_sentence_length]
+
+                     inputs = self.seq_tokenizer.encode(seq_str.upper())
+                     # print("len:%d, %s" % (len(seq_str), seq_str.upper()))
+                     if self.prepend_bos:
+                         inputs = [self.cls_idx] + inputs
+                     if self.append_eos:
+                         inputs = inputs + [self.eos_idx]
+                     # print("inputs:%d, " % len(inputs), inputs)
+                     cur_seq_encoded_list.append(inputs)
+                     if max_sentence_len < len(inputs):
+                         max_sentence_len = len(inputs)
+                 seq_encoded_list.append(cur_seq_encoded_list)
+         # for input
+         input_ids = torch.empty(
+             (
+                 batch_size,
+                 max_sentence_num,
+                 max_sentence_len,
+             ),
+             dtype=torch.int64,
+         )
+         input_ids.fill_(self.padding_idx)
+
+         position_ids = None
+         if not self.no_position_embeddings:
+             position_ids = torch.empty(
+                 (
+                     batch_size,
+                     max_sentence_num,
+                     max_sentence_len
+                 ),
+                 dtype=torch.int64,
+             )
+             position_ids.fill_(self.padding_idx)
+
+         token_type_ids = None
+         if not self.no_token_type_embeddings:
+             token_type_ids = torch.empty(
+                 (
+                     batch_size,
+                     max_sentence_num,
+                     max_sentence_len
+                 ),
+                 dtype=torch.int64,
+             )
+             token_type_ids.fill_(self.padding_idx)
+         attention_masks = torch.empty(
+             (
+                 batch_size,
+                 max_sentence_num,
+                 max_sentence_len
+             ),
+             dtype=torch.int64,
+         )
+         attention_masks.fill_(0)
+
+         return seq_encoded_list, input_ids, position_ids, token_type_ids, attention_masks, max_sentence_num, max_sentence_len
+
+     def __atom_seq_encode__(self, batch_size, seqs):
+         '''
+         This function does not add the special tokens [CLS] and [SEP].
+         :param batch_size:
+         :param seqs:
+         :return:
+         '''
+         seq_encoded_list = []
+         for seq_idx, cur_seq in enumerate(seqs):
+             if isinstance(cur_seq, str):  # smiles
+                 cur_seq_encoded = self.atom_tokenizer.encode_smi(cur_seq,
+                                                                  prepend_bos=False,
+                                                                  append_eos=False)
+             elif isinstance(cur_seq, list):  # atom list
+                 cur_seq_encoded = self.atom_tokenizer.encode(cur_seq,
+                                                              prepend_bos=False,
+                                                              append_eos=False)
+             else:
+                 raise Exception("Unsupported molecule input type:", type(cur_seq))
+             # this length already excludes the special tokens to be added
+             if self.atom_truncation_seq_length:
+                 cur_seq_encoded = cur_seq_encoded[:self.atom_truncation_seq_length]
+             seq_encoded_list.append(cur_seq_encoded)
+         max_len = max(len(seq_encoded) for seq_encoded in seq_encoded_list)
+         max_len = max_len + int(self.atom_prepend_bos) + int(self.atom_append_eos)
+         # for input
+         input_ids = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+             ),
+             dtype=torch.int64,
+         )
+         input_ids.fill_(self.atom_padding_idx)
+
+         position_ids = None
+         if not self.no_position_embeddings:
+             position_ids = torch.empty(
+                 (
+                     batch_size,
+                     max_len,
+                 ),
+                 dtype=torch.int64,
+             )
+             position_ids.fill_(self.atom_padding_idx)
+
+         token_type_ids = None
+         if not self.no_token_type_embeddings:
+             token_type_ids = torch.empty(
+                 (
+                     batch_size,
+                     max_len,
+                 ),
+                 dtype=torch.int64,
+             )
+             token_type_ids.fill_(self.atom_padding_idx)
+         attention_masks = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+             ),
+             dtype=torch.int64,
+         )
+         attention_masks.fill_(0)
+
+         return seq_encoded_list, input_ids, position_ids, token_type_ids, attention_masks, max_len
+
+     def __vector_encode__(self, batch_size, vectors):
+         embedding_vector_dim = vectors[0].shape[0]
+         filled_vectors = torch.empty(
+             (
+                 batch_size,
+                 embedding_vector_dim
+             ),
+             dtype=torch.float32,
+         )
+         filled_vectors.fill_(0.0)
+         return filled_vectors, 1
+
+     def __atom_vector_encode__(self, batch_size, vectors):
+         return self.__vector_encode__(batch_size, vectors)
+
+     def __multi_vector_encode__(self, batch_size, vectors):
+         embedding_vector_dim = vectors[0][0].shape[0]
+         filled_vectors = torch.empty(
+             (
+                 batch_size,
+                 self.max_sentences,
+                 embedding_vector_dim
+             ),
+             dtype=torch.float32,
+         )
+         filled_vectors.fill_(0.0)
+         return filled_vectors, self.max_sentences, 1
+
+     def __matrix_encode__(self, batch_size, matrices):
+         '''
+         This function does not add embedding vectors for the special tokens [CLS] and [SEP].
+         :param batch_size:
+         :param matrices:
+         :return:
+         '''
+         max_len = max(matrix.shape[0] for matrix in matrices)
+         if self.matrix_add_special_token:
+             max_len -= 2
+         if self.truncation_matrix_length:
+             max_len = min(max_len, self.truncation_matrix_length)
+         if self.matrix_add_special_token:
+             max_len += 2
+         else:
+             max_len = max_len + int(self.prepend_bos) + int(self.append_eos)
+         embedding_vector_dim = matrices[0].shape[1]
+         # for input
+         filled_matrices = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+                 embedding_vector_dim
+             ),
+             dtype=torch.float32,
+         )
+         filled_matrices.fill_(0.0)
+         attention_masks = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+             ),
+             dtype=torch.int64,
+         )
+         attention_masks.fill_(0)
+         return filled_matrices, attention_masks, max_len
+
+     def __atom_matrix_encode__(self, batch_size, matrices):
+         '''
+         This function does not add embedding vectors for the special tokens [CLS] and [SEP].
+         :param batch_size:
+         :param matrices:
+         :return:
+         '''
+         max_len = max(matrix.shape[0] for matrix in matrices)
+         if self.matrix_add_special_token:
+             max_len -= 2
+         if self.atom_truncation_matrix_length:
+             max_len = min(max_len, self.atom_truncation_matrix_length)
+         if self.matrix_add_special_token:
+             max_len += 2
+         else:
+             max_len = max_len + int(self.atom_prepend_bos) + int(self.atom_append_eos)
+         embedding_vector_dim = matrices[0].shape[1]
+         # for input
+         filled_matrices = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+                 embedding_vector_dim
+             ),
+             dtype=torch.float32,
+         )
+         filled_matrices.fill_(0.0)
+         attention_masks = torch.empty(
+             (
+                 batch_size,
+                 max_len,
+             ),
+             dtype=torch.int64,
+         )
+         attention_masks.fill_(0)
+         return filled_matrices, attention_masks, max_len
+
+     def __multi_matrix_encode__(self, batch_size, matrices):
+         '''
+         This function does not add embedding vectors for the special tokens [CLS] and [SEP].
+         :param batch_size:
+         :param matrices:
+         :return:
+         '''
+         max_sentence_num = max(len(cur_matrix) for cur_matrix in matrices)
+         max_sentence_num = min(max_sentence_num, self.max_sentences)
+         if self.trunc_type == "left":
+             max_sentence_len = max(max(matrix.shape[0] for matrix in cur_matrix[-max_sentence_num:]) for cur_matrix in matrices)
+         else:
+             max_sentence_len = max(max(matrix.shape[0] for matrix in cur_matrix[:max_sentence_num]) for cur_matrix in matrices)
+         # print("encoder max_sentence_num:%d, max_sentence_len: %d" % (max_sentence_num, max_sentence_len))
+         if self.matrix_add_special_token:
+             max_sentence_len -= 2
+         max_sentence_len = min(max_sentence_len, self.max_sentence_length)
+         # print("encoder max_sentence_num:%d, max_sentence_len: %d" % (max_sentence_num, max_sentence_len))
+         if self.matrix_add_special_token:
+             max_sentence_len += 2
+         else:
+             max_sentence_len = max_sentence_len + int(self.prepend_bos) + int(self.append_eos)
+         # print("encoder max_sentence_num:%d, max_sentence_len: %d" % (max_sentence_num, max_sentence_len))
+         # print("self.max_sentence_length: %d" % self.max_sentence_length)
+         # print("max_sentence_len: %d" % max_sentence_len)
+         embedding_vector_dim = matrices[0][0].shape[1]
+         # for input
+         filled_matrices = torch.empty(
+             (
+                 batch_size,
+                 max_sentence_num,
+                 max_sentence_len,
+                 embedding_vector_dim
+             ),
+             dtype=torch.float32,
+         )
+         filled_matrices.fill_(0.0)
+         attention_masks = torch.empty(
+             (
+                 batch_size,
+                 max_sentence_num,
+                 max_sentence_len
+             ),
+             dtype=torch.int64,
+         )
+         attention_masks.fill_(0)
+         return filled_matrices, attention_masks, max_sentence_num, max_sentence_len
+
+     def __call_single__(self, batch_size, seq_types, seqs, vectors, matrices, labels):
+         max_length = sys.maxsize
+         input_ids, position_ids, token_type_ids, seq_attention_masks = None, None, None, None
+         seq_part_of_input = False
+         molecule_flag = False
+         multi_seq_flag = False
+         if seqs:
+             new_seqs = []
+             for seq_idx, seq_type in enumerate(seq_types):
+                 if seq_type == "gene":
+                     new_seqs.append(gene_seq_replace(seqs[seq_idx].upper()))
+                 elif seq_type == "molecule":
+                     if isinstance(seqs[seq_idx], str):
+                         new_seqs.append(AlphabetAtom.smiles_2_atom_seq(seqs[seq_idx]))
+                     else:
+                         new_seqs.append(seqs[seq_idx])
+                     molecule_flag = True
+                 elif seq_type == "multi_gene":
+                     new_seqs.append([gene_seq_replace(seq).upper() for seq in seqs[seq_idx].split(",")])
+                     multi_seq_flag = True
+                 elif seq_type == "multi_prot":
+                     new_seqs.append([seq.upper() for seq in seqs[seq_idx].split(",")])
+                     multi_seq_flag = True
+                 else:
+                     new_seqs.append(seqs[seq_idx].upper())
+             if molecule_flag:
+                 # seq_encoded_list carries no special tokens; input_ids reserves slots for them
+                 # according to the flags, and seq_max_length already includes their length
+                 seq_encoded_list, input_ids, position_ids, token_type_ids, seq_attention_masks, seq_max_length = self.__atom_seq_encode__(
+                     batch_size=batch_size, seqs=new_seqs)
+
+             elif multi_seq_flag:
+                 # special tokens are added to seq_encoded_list and input_ids according to the flags;
+                 # seq_max_len already includes their length
+                 seq_encoded_list, input_ids, position_ids, token_type_ids, seq_attention_masks, seq_max_num, seq_max_len = self.__multi_seq_encode__(
+                     batch_size=batch_size, seqs=new_seqs)
+                 '''
+                 print("seq_max_num: %d" % seq_max_num)
+                 print("seq_max_len: %d" % seq_max_len)
+                 print(input_ids.shape)
+                 print("len(seq_encoded_list): %d" % len(seq_encoded_list))
+                 for input_id in input_ids:
+                     print(len(input_id))
+                     for matrix in input_id:
+                         print(matrix.shape)
+                     print("*****")
+                 '''
+             else:
+                 # seq_encoded_list carries no special tokens; input_ids reserves slots for them
+                 # according to the flags, and seq_max_length already includes their length
+                 seq_encoded_list, input_ids, position_ids, token_type_ids, seq_attention_masks, seq_max_length = self.__seq_encode__(
+                     batch_size=batch_size, seqs=new_seqs)
+             if multi_seq_flag:
+                 max_length = min(max_length, seq_max_num * seq_max_len)
+             else:
+                 max_length = min(max_length, seq_max_length)
+             seq_part_of_input = True
+
+         encoded_vectors = None
+         vector_part_of_input = False
+         if vectors is not None and len(vectors) > 0:
+             if multi_seq_flag:
+                 encoded_vectors, vector_max_num, vector_max_len = self.__multi_vector_encode__(batch_size=batch_size, vectors=vectors)
+             elif molecule_flag:
+                 encoded_vectors, vector_max_length = self.__atom_vector_encode__(batch_size=batch_size, vectors=vectors)
+             else:
+                 encoded_vectors, vector_max_length = self.__vector_encode__(batch_size=batch_size, vectors=vectors)
+             # max_length = min(max_length, vector_max_length)
+             vector_part_of_input = True
+
+         encoded_matrices, matrix_attention_masks = None, None
+         matrix_part_of_input = False
+         # print("multi_seq_flag:", multi_seq_flag)
+         if matrices is not None and len(matrices) > 0:
+             if multi_seq_flag:
+                 # padded according to the flags: number of sentences, and whether the special-token length is added
+                 encoded_matrices, matrix_attention_masks, matrix_max_num, matrix_max_len = self.__multi_matrix_encode__(
+                     batch_size=batch_size,
+                     matrices=matrices)
+                 '''
+                 print("matrix_max_num: %d" % matrix_max_num)
+                 print("matrix_max_len: %d" % matrix_max_len)
+                 print(encoded_matrices.shape)
+                 print("len(matrices): %d" % len(matrices))
+                 for matrix_array in matrices:
+                     print(len(matrix_array))
+                     for matrix in matrix_array:
+                         print(matrix.shape)
+                     print("*****")
+                 '''
+             elif molecule_flag:
+                 # padded according to the flags: number of sentences, and whether the special-token length is added
+                 encoded_matrices, matrix_attention_masks, matrix_max_length = self.__atom_matrix_encode__(batch_size=batch_size,
+                                                                                                           matrices=matrices)
+             else:
+                 # padded according to the flags: number of sentences, and whether the special-token length is added
+                 encoded_matrices, matrix_attention_masks, matrix_max_length = self.__matrix_encode__(batch_size=batch_size,
+                                                                                                      matrices=matrices)
+             if multi_seq_flag:
+                 max_length = min(max_length, matrix_max_num * matrix_max_len)
+             else:
+                 max_length = min(max_length, matrix_max_length)
+             matrix_part_of_input = True
+         has_label = False
+         if labels:
+             has_label = True
+
+         new_labels = []
+         num_sentences = 1
+         sentence_length = 1
+         for sample_idx in range(batch_size):
+             # seq
+             if seq_part_of_input:
+                 if multi_seq_flag:
+                     # cls_idx has already been added
+                     pass
+                 elif not molecule_flag and self.prepend_bos:
+                     input_ids[sample_idx, 0] = self.cls_idx
+                 elif molecule_flag and self.atom_prepend_bos:
+                     input_ids[sample_idx, 0] = self.atom_cls_idx
+
+                 seq_encoded = seq_encoded_list[sample_idx]
+                 real_seq_len = len(seq_encoded)
+
+                 # seq_tensor = torch.tensor(seq_encoded, dtype=torch.int64)
+                 # print("seq_encoded:")
+                 # print(seq_encoded)
+                 if multi_seq_flag:
+                     cur_seq_num = min(len(seq_encoded), seq_max_num)
+                     if len(seq_encoded) > cur_seq_num:
+                         if self.trunc_type == "left":
+                             seq_encoded = seq_encoded[-cur_seq_num:]
+                         else:
+                             seq_encoded = seq_encoded[:cur_seq_num]
+                     if num_sentences < cur_seq_num:
+                         num_sentences = cur_seq_num
+                     # print("cur_seq_num: %d" % len(seq_encoded))
+                     for seq_idx in range(cur_seq_num):
+                         cur_seq = seq_encoded[seq_idx]
+                         cur_seq_len = min(len(cur_seq), seq_max_len)
+                         '''
+                         print("cur_seq:")
+                         print(cur_seq_len)
+                         print("input_ids:")
+                         print(input_ids.shape)
+                         '''
+                         input_ids[sample_idx, seq_idx, :cur_seq_len] = torch.tensor(cur_seq[:cur_seq_len], dtype=torch.int64)
+                         seq_attention_masks[sample_idx, seq_idx, :cur_seq_len] = 1
+                         if cur_seq_len > sentence_length:
+                             sentence_length = cur_seq_len
+                 elif molecule_flag:
+                     seq_tensor = torch.tensor(seq_encoded, dtype=torch.int64)
+                     input_ids[sample_idx, int(self.atom_prepend_bos): real_seq_len + int(self.atom_prepend_bos)] = seq_tensor
+                     cur_sentence_length = int(self.atom_prepend_bos) + real_seq_len + int(self.atom_append_eos)
+                     if cur_sentence_length > sentence_length:
+                         sentence_length = cur_sentence_length
+                 else:
+                     seq_tensor = torch.tensor(seq_encoded, dtype=torch.int64)
+                     input_ids[sample_idx, int(self.prepend_bos): real_seq_len + int(self.prepend_bos)] = seq_tensor
+                     cur_sentence_length = int(self.prepend_bos) + real_seq_len + int(self.append_eos)
+                     if cur_sentence_length > sentence_length:
+                         sentence_length = cur_sentence_length
+
+                 if multi_seq_flag:
+                     # eos_idx has already been added
+                     pass
+                 elif not molecule_flag and self.append_eos:
+                     input_ids[sample_idx, real_seq_len + int(self.prepend_bos)] = self.eos_idx
+                 elif molecule_flag and self.atom_append_eos:
+                     input_ids[sample_idx, real_seq_len + int(self.atom_prepend_bos)] = self.atom_eos_idx
+
+                 if multi_seq_flag:
+                     cur_len = num_sentences * sentence_length
+                 elif molecule_flag:
+                     cur_len = int(self.atom_prepend_bos) + real_seq_len + int(self.atom_append_eos)
+                 else:
+                     cur_len = int(self.prepend_bos) + real_seq_len + int(self.append_eos)
+
+                 if not self.no_position_embeddings:
+                     if multi_seq_flag:
+                         for pos_idx in range(0, cur_len):
+                             position_ids[sample_idx, pos_idx // sentence_length, pos_idx % sentence_length] = pos_idx % sentence_length
+                     else:
+                         for pos_idx in range(0, cur_len):
+                             position_ids[sample_idx, pos_idx] = pos_idx
+
+                 if not self.no_token_type_embeddings:
+                     seq_type = seq_types[sample_idx]
+                     if seq_type == "gene":
+                         type_value = 0
+                     else:
+                         type_value = 1
+                     if multi_seq_flag:
+                         for pos_idx in range(0, cur_len):
+                             token_type_ids[sample_idx, pos_idx // sentence_length, pos_idx % sentence_length] = type_value
+                     else:
+                         for pos_idx in range(0, cur_len):
+                             token_type_ids[sample_idx, pos_idx] = type_value
+
+                 if multi_seq_flag:
+                     pass
+                 else:
+                     seq_attention_masks[sample_idx, 0: cur_len] = 1
+
+             # vector
+             if vector_part_of_input:
+                 if multi_seq_flag:
+                     cur_vector_num = min(len(vectors[sample_idx]), vector_max_num)
+                     if num_sentences < cur_vector_num:
+                         num_sentences = cur_vector_num
+                     for vector_idx in range(cur_vector_num):
+                         encoded_vectors[sample_idx, vector_idx, :] = torch.tensor(vectors[sample_idx][vector_idx], dtype=torch.float32)
+                 else:
+                     encoded_vectors[sample_idx, :] = torch.tensor(vectors[sample_idx], dtype=torch.float32)
+
+             # matrix
+             if matrix_part_of_input:
+                 '''
+                 matrix_encoded = matrices[sample_idx]
+                 if self.matrix_add_special_token:
+                     real_seq_len = matrix_encoded.shape[0] - 2
+                 else:
+                     real_seq_len = matrix_encoded.shape[0]
+                 if multi_seq_flag:
+                     pass
+                 elif molecule_flag:
+                     # real_seq_len = real_seq_len - int(self.atom_prepend_bos) - int(self.atom_append_eos)
+                     real_seq_len = min(real_seq_len, self.atom_truncation_matrix_length)
+                 else:
+                     # real_seq_len = real_seq_len - int(self.prepend_bos) - int(self.append_eos)
+                     real_seq_len = min(real_seq_len, self.truncation_matrix_length)
+                 # print("real_seq_len: %d" % real_seq_len)
+                 '''
+                 if multi_seq_flag:
+                     # multi-sequence matrices
+                     matrix_encoded_list = matrices[sample_idx]
+                     cur_matrix_num = min(len(matrix_encoded_list), matrix_max_num)
+                     if len(matrix_encoded_list) > cur_matrix_num:
+                         if self.trunc_type == "left":
+                             matrix_encoded_list = matrix_encoded_list[-cur_matrix_num:]
+                         else:
+                             matrix_encoded_list = matrix_encoded_list[:cur_matrix_num]
+                     if num_sentences < cur_matrix_num:
+                         num_sentences = cur_matrix_num
+                     # print("matrix_encoded_list: %d" % len(matrix_encoded_list))
+                     for matrix_idx in range(cur_matrix_num):
+                         # print("matrix_idx: %d" % matrix_idx)
+                         cur_matrix = matrix_encoded_list[matrix_idx]
+                         cur_matrix = torch.tensor(cur_matrix, dtype=torch.float32)
+                         cur_matrix_len = min(cur_matrix.shape[0], matrix_max_len)
+                         if self.matrix_add_special_token:
+                             encoded_matrices[sample_idx, matrix_idx, 0: cur_matrix_len - 1] = cur_matrix[0: cur_matrix_len - 1]
+                             encoded_matrices[sample_idx, matrix_idx, cur_matrix_len - 1] = cur_matrix[-1]
+                             matrix_attention_masks[sample_idx, matrix_idx, 0: cur_matrix_len] = 1
+                         else:
+                             encoded_matrices[sample_idx, matrix_idx, int(self.prepend_bos): cur_matrix_len + int(self.prepend_bos)] = cur_matrix[0: cur_matrix_len]
+                             matrix_attention_masks[sample_idx, matrix_idx, 0: int(self.prepend_bos) + cur_matrix_len + int(self.append_eos)] = 1
+                             cur_matrix_len = int(self.prepend_bos) + cur_matrix_len + int(self.append_eos)
+                         if sentence_length < cur_matrix_len:
+                             sentence_length = cur_matrix_len
+                 else:
+                     matrix_encoded = matrices[sample_idx]
+                     if self.matrix_add_special_token:
+                         real_seq_len = matrix_encoded.shape[0] - 2
+                     else:
+                         real_seq_len = matrix_encoded.shape[0]
+                     if molecule_flag:
+                         # real_seq_len = real_seq_len - int(self.atom_prepend_bos) - int(self.atom_append_eos)
+                         real_seq_len = min(real_seq_len, self.atom_truncation_matrix_length)
+                         matrix = torch.tensor(matrix_encoded, dtype=torch.float32)
+                         if self.matrix_add_special_token:
+                             encoded_matrices[sample_idx, 0: real_seq_len + 2] \
+                                 = matrix[0: real_seq_len + 2]
+                             matrix_attention_masks[sample_idx, 0: real_seq_len + 2] = 1
+                             cur_sentence_length = real_seq_len + 2
+                         else:
+                             encoded_matrices[sample_idx, int(self.atom_prepend_bos): real_seq_len + int(self.atom_prepend_bos)] \
+                                 = matrix[0: real_seq_len]
+                             # matrix_attention_masks[sample_idx, int(self.atom_prepend_bos): real_seq_len + int(self.atom_prepend_bos)] = 1
+                             matrix_attention_masks[sample_idx, 0: int(self.atom_prepend_bos) + real_seq_len + int(self.atom_append_eos)] = 1
+                             cur_sentence_length = int(self.atom_prepend_bos) + real_seq_len + int(self.atom_append_eos)
+                         if cur_sentence_length > sentence_length:
+                             sentence_length = cur_sentence_length
+                     else:
+                         # real_seq_len = real_seq_len - int(self.prepend_bos) - int(self.append_eos)
+                         real_seq_len = min(real_seq_len, self.truncation_matrix_length)
+                         matrix = torch.tensor(matrix_encoded, dtype=torch.float32)
+                         if self.matrix_add_special_token:
+                             encoded_matrices[sample_idx, 0: real_seq_len + 2] = matrix[0: real_seq_len + 2]
+                             matrix_attention_masks[sample_idx, 0: real_seq_len + 2] = 1
+                             cur_sentence_length = real_seq_len + 2
+                         else:
+                             encoded_matrices[sample_idx, int(self.prepend_bos): real_seq_len + int(self.prepend_bos)] = matrix[0: real_seq_len]
+                             # matrix_attention_masks[sample_idx, int(self.prepend_bos): real_seq_len + int(self.prepend_bos)] = 1
+                             matrix_attention_masks[sample_idx, 0: int(self.prepend_bos) + real_seq_len + int(self.append_eos)] = 1
+                             cur_sentence_length = int(self.prepend_bos) + real_seq_len + int(self.append_eos)
+                         if cur_sentence_length > sentence_length:
+                             sentence_length = cur_sentence_length
+
+             if has_label:
+                 if multi_seq_flag:
+                     # to do
+                     new_labels.append(
+                         self.__parse_label__(max_length, self.task_level_type,
+                                              self.label_size, self.output_mode, labels[sample_idx]))
+                 elif molecule_flag:
+                     new_labels.append(
+                         self.__atom_parse_label__(max_length, self.task_level_type,
+                                                   self.label_size, self.output_mode, labels[sample_idx]))
+                 else:
+                     new_labels.append(
+                         self.__parse_label__(max_length, self.task_level_type,
+                                              self.label_size, self.output_mode, labels[sample_idx]))
+         if new_labels is not None and new_labels:
+             if self.output_mode in ["regression"]:
+                 labels = torch.tensor(new_labels, dtype=torch.float32)
+             else:
+                 labels = torch.tensor(new_labels, dtype=torch.int64)
+         else:
+             labels = None
+         '''
+         print(input_ids.shape)
+         print("encoded_matrices:")
+         print(encoded_matrices.shape)
+         print("num_sentences:%d" % num_sentences)
+         print("sentence_length:%d" % sentence_length)
+         if labels is not None:
+             print("labels:")
+             print(labels.shape)
+         '''
+
+         if multi_seq_flag:
+             if seq_part_of_input:
+                 input_ids = torch.reshape(input_ids, (input_ids.shape[0], -1))
+             if matrix_part_of_input:
+                 encoded_matrices = torch.reshape(encoded_matrices, (encoded_matrices.shape[0], -1, encoded_matrices.shape[-1]))
+             if position_ids is not None:
+                 position_ids = torch.reshape(position_ids, (position_ids.shape[0], -1))
+             if token_type_ids is not None:
+                 token_type_ids = torch.reshape(token_type_ids, (token_type_ids.shape[0], -1))
+             if seq_attention_masks is not None:
+                 seq_attention_masks = torch.reshape(seq_attention_masks, (seq_attention_masks.shape[0], -1))
+             if matrix_attention_masks is not None:
+                 matrix_attention_masks = torch.reshape(matrix_attention_masks, (matrix_attention_masks.shape[0], -1))
+         '''
+         print(input_ids.shape)
+         print("encoded_matrices:")
+         print(encoded_matrices.shape)
+         print("num_sentences:%d" % num_sentences)
+         print("sentence_length:%d" % sentence_length)
+         if labels is not None:
+             print("labels:")
+             print(labels.shape)
+         print("-" * 50)
+         '''
+
+         return input_ids, \
+             position_ids, \
+             token_type_ids, \
+             seq_attention_masks, \
+             encoded_vectors, \
+             encoded_matrices, \
+             matrix_attention_masks, \
+             num_sentences, \
+             sentence_length, \
+             labels
+
1207
+ def __call__(self, raw_batch: Sequence[dict]):
1208
+ batch_size = len(raw_batch)
1209
+ # pair
1210
+ if "seq_id_a" in raw_batch[0] and "seq_id_b" in raw_batch[0]:
1211
+ res = {}
1212
+ # seq_ids_a = []
1213
+ seq_types_a = []
1214
+ seqs_a = []
1215
+ vectors_a = []
1216
+ matrices_a = []
1217
+
1218
+ # seq_ids_b = []
1219
+ seq_types_b = []
1220
+ seqs_b = []
1221
+ vectors_b = []
1222
+ matrices_b = []
1223
+
1224
+ labels = []
1225
+ for item in raw_batch:
1226
+ # seq_ids_a.append(item["seq_id_a"])
1227
+ seq_types_a.append(item["seq_type_a"])
1228
+ if item["seq_a"] is not None:
1229
+ seqs_a.append(item["seq_a"])
1230
+ if item["vector_a"] is not None:
1231
+ vectors_a.append(item["vector_a"])
1232
+ if item["matrix_a"] is not None:
1233
+ matrices_a.append(item["matrix_a"])
1234
+
1235
+ # seq_ids_b.append(item["seq_id_b"])
1236
+ seq_types_b.append(item["seq_type_b"])
1237
+ if item["seq_b"] is not None:
1238
+ seqs_b.append(item["seq_b"])
1239
+ if item["vector_b"] is not None:
1240
+ vectors_b.append(item["vector_b"])
1241
+ if item["matrix_b"] is not None:
1242
+ matrices_b.append(item["matrix_b"])
1243
+ if "label" in item and item["label"] is not None:
1244
+ labels.append(item["label"])
1245
+ input_ids_a, position_ids_a, token_type_ids_a, seq_attention_masks_a, encoded_vectors_a, encoded_matrices_a, matrix_attention_masks_a, num_sentences_a, sentence_length_a, labels \
1246
+ = self.__call_single__(batch_size, seq_types_a, seqs_a, vectors_a, matrices_a, labels)
1247
+ if not hasattr(self, "max_sentences") or self.max_sentences is None:
1248
+ res.update({
1249
+ "input_ids_a": input_ids_a,
1250
+ "position_ids_a": position_ids_a,
1251
+ "token_type_ids_a": token_type_ids_a,
1252
+ "seq_attention_masks_a": seq_attention_masks_a,
1253
+ "vectors_a": encoded_vectors_a,
1254
+ "matrices_a": encoded_matrices_a,
1255
+ "matrix_attention_masks_a": matrix_attention_masks_a,
1256
+ "labels": labels if labels is not None and len(labels) > 0 else None
1257
+ })
1258
+ else:
1259
+ res.update({
1260
+ "input_ids_a": input_ids_a,
1261
+ "position_ids_a": position_ids_a,
1262
+ "token_type_ids_a": token_type_ids_a,
1263
+ "seq_attention_masks_a": seq_attention_masks_a,
1264
+ "vectors_a": encoded_vectors_a,
1265
+ "matrices_a": encoded_matrices_a,
1266
+ "matrix_attention_masks_a": matrix_attention_masks_a,
1267
+ "num_sentences_a": num_sentences_a,
1268
+ "sentence_length_a": sentence_length_a,
1269
+ "labels": labels if labels is not None and len(labels) > 0 else None
1270
+ })
1271
+ input_ids_b, position_ids_b, token_type_ids_b, seq_attention_masks_b, encoded_vectors_b, encoded_matrices_b, matrix_attention_masks_b, num_sentences_b, sentence_length_b, _ \
1272
+ = self.__call_single__(batch_size, seq_types_b, seqs_b, vectors_b, matrices_b, labels=None)
1273
+ if not hasattr(self, "max_sentences") or self.max_sentences is None:
1274
+ res.update({
1275
+ "input_ids_b": input_ids_b,
1276
+ "position_ids_b": position_ids_b,
1277
+ "token_type_ids_b": token_type_ids_b,
1278
+ "seq_attention_masks_b": seq_attention_masks_b,
1279
+ "vectors_b": encoded_vectors_b,
1280
+ "matrices_b": encoded_matrices_b,
1281
+ "matrix_attention_masks_b": matrix_attention_masks_b
1282
+ })
1283
+ else:
1284
+ res.update({
1285
+ "input_ids_b": input_ids_b,
1286
+ "position_ids_b": position_ids_b,
1287
+ "token_type_ids_b": token_type_ids_b,
1288
+ "seq_attention_masks_b": seq_attention_masks_b,
1289
+ "vectors_b": encoded_vectors_b,
1290
+ "matrices_b": encoded_matrices_b,
1291
+ "num_sentences_b": num_sentences_b,
1292
+ "sentence_length_b": sentence_length_b,
1293
+ "matrix_attention_masks_b": matrix_attention_masks_b
1294
+ })
1295
+ return res
1296
+ else:
1297
+ res = {}
1298
+ # seq_ids = []
1299
+ seq_types = []
1300
+ seqs = []
1301
+ vectors = []
1302
+ matrices = []
1303
+ labels = []
1304
+ for item in raw_batch:
1305
+ # seq_ids.append(item["seq_id"])
1306
+ seq_types.append(item["seq_type"])
1307
+ if item["seq"] is not None:
1308
+ seqs.append(item["seq"])
1309
+ if item["vector"] is not None:
1310
+ vectors.append(item["vector"])
1311
+ if item["matrix"] is not None:
1312
+ matrices.append(item["matrix"])
1313
+ if item["label"] is not None:
1314
+ labels.append(item["label"])
1315
+ '''
1316
+ print("seqs:")
1317
+ print(seqs)
1318
+ print([len(seq) for seq in seqs])
1319
+ print("matrices:")
1320
+ print(matrices)
1321
+ print([matrix.shape for matrix in matrices])
1322
+ print("labels:")
1323
+ print(labels)
1324
+ print([len(eval(label)) for label in labels])
1325
+ '''
1326
+ input_ids, position_ids, token_type_ids, seq_attention_masks, encoded_vectors, encoded_matrices, matrix_attention_masks, num_sentences, sentence_length, labels = self.__call_single__(
1327
+ batch_size, seq_types, seqs, vectors, matrices, labels=labels)
1328
+
1329
+ if not hasattr(self, "max_sentences") or self.max_sentences is None:
1330
+ res.update({
1331
+ "input_ids": input_ids,
1332
+ "position_ids": position_ids,
1333
+ "token_type_ids": token_type_ids,
1334
+ "seq_attention_masks": seq_attention_masks,
1335
+ "vectors": encoded_vectors,
1336
+ "matrices": encoded_matrices,
1337
+ "matrix_attention_masks": matrix_attention_masks,
1338
+ "labels": labels if labels is not None and len(labels) > 0 else None
1339
+ })
1340
+ else:
1341
+ res.update({
1342
+ "input_ids": input_ids,
1343
+ "position_ids": position_ids,
1344
+ "token_type_ids": token_type_ids,
1345
+ "seq_attention_masks": seq_attention_masks,
1346
+ "vectors": encoded_vectors,
1347
+ "matrices": encoded_matrices,
1348
+ "matrix_attention_masks": matrix_attention_masks,
1349
+ "num_sentences": num_sentences,
1350
+ "sentence_length": sentence_length,
1351
+ "labels": labels if labels is not None and len(labels) > 0 else None
1352
+ })
1353
+
1354
+ '''
1355
+ for item in res.items():
1356
+ key_name = item[0]
1357
+ print(key_name, ":")
1358
+ if item[1] is not None:
1359
+ print(item[1])
1360
+ print(item[1].shape)
1361
+ else:
1362
+ print("None")
1363
+ '''
1364
+ return res
1365
+
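+ # Illustrative usage (values below are hypothetical; the dict keys match
+ # those consumed by __call__ above):
+ # converter = alphabet.get_batch_converter(no_position_embeddings=True,
+ # no_token_type_embeddings=False)
+ # batch = converter([{"seq_id": "P1", "seq_type": "prot", "seq": "MKTAYIAKQR",
+ # "vector": None, "matrix": None, "label": None}])
+ # batch["input_ids"] and batch["seq_attention_masks"] are padded tensors.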
classification_loss.py ADDED
@@ -0,0 +1,296 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+ '''
+ @license: (C) Copyright 2021, Hey.
+ @author: Hey
+ @email: [email protected]
+ @tel: 137****6540
+ @datetime: 2023/5/3 20:35
+ @project: LucaOne
+ @file: classification_loss.py
+ @desc: classification loss functions
+ '''
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from .masked_loss import _MaskedLoss
+
+ class MaskedFocalLoss(_MaskedLoss):
+ """Masked FocalLoss"""
+ def __init__(self, alpha=1, gamma=2, normalization=False, reduction='mean', ignore_nans=True, ignore_value=-100):
+ super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+ self.criterion = FocalLoss(alpha=alpha, gamma=gamma, normalization=normalization, reduction='none')
+
+
+ class FocalLoss(nn.Module):
+ '''
+ Focal loss
+ '''
+ def __init__(self, alpha=1, gamma=2, normalization=False, reduction="mean"):
+ super(FocalLoss, self).__init__()
+ self.alpha = alpha
+ self.gamma = gamma
+ self.normalization = normalization
+ self.reduction = reduction
+
+ def forward(self, inputs, targets):
+ if self.normalization:
+ '''
+ reduction: the operation applied to the output loss; one of 'none', 'mean', or 'sum':
+ 'none' applies no reduction to the loss,
+ 'mean' returns the mean of the loss,
+ 'sum' returns the sum of the loss; the default is 'mean'
+ '''
+ bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
+ probs = torch.sigmoid(inputs)
+ else:
+ bce = F.binary_cross_entropy(inputs, targets, reduction='none')
+ probs = inputs
+ pt = targets * probs + (1 - targets) * (1 - probs)
+ modulate = 1 if self.gamma is None else (1 - pt) ** self.gamma
+
+ focal_loss = modulate * bce
+
+ if self.alpha is not None:
+ assert 0 <= self.alpha <= 1
+ alpha_weights = targets * self.alpha + (1 - targets) * (1 - self.alpha)
+ focal_loss *= alpha_weights
+ if self.reduction == "mean":
+ # global mean
+ return torch.mean(focal_loss)
+ if self.reduction in ["summean", "meansum"]:
+ # sum within each sample, then mean across samples
+ return torch.mean(torch.sum(focal_loss, dim=1))
+ elif self.reduction == "sum":
+ return torch.sum(focal_loss, dim=1)
+ else:
+ return focal_loss
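+ # Focal loss down-weights easy examples: FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t).
+ # Illustrative only (tensors here are hypothetical):
+ # loss_fct = FocalLoss(alpha=0.7, gamma=2.0, normalization=True, reduction="mean")
+ # logits = torch.randn(4, 10); targets = torch.randint(0, 2, (4, 10)).float()
+ # loss = loss_fct(logits, targets) # normalization=True expects raw logits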
+
+
+ class MaskedMultiLabelCCE(_MaskedLoss):
+ """Masked MultiLabel CCE"""
+ def __init__(self, normalization=False, reduction='mean', ignore_nans=True, ignore_value=-100):
+ super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+ self.criterion = MultiLabelCCE(normalization=normalization, reduction='none')
+
+
+ class MultiLabelCCE(nn.Module):
+ '''
+ Multi Label CCE
+ '''
+ def __init__(self, normalization=False, reduction='mean'):
+ super(MultiLabelCCE, self).__init__()
+ self.normalization = normalization
+ self.reduction = reduction
+
+ def forward(self, inputs, targets):
+ """
+ Cross entropy for multi-label classification.
+ Note: y_true and y_pred have the same shape, and the elements of y_true are either 0 or 1;
+ 1 indicates that the corresponding class is a target class, 0 that it is a non-target class.
+ """
+ if self.normalization:
+ y_pred = torch.softmax(inputs, dim=-1)
+ else:
+ y_pred = inputs
+ y_true = targets
+ y_pred = (1 - 2 * y_true) * y_pred
+ y_pred_neg = y_pred - y_true * 1e12
+ y_pred_pos = y_pred - (1 - y_true) * 1e12
+ zeros = torch.zeros_like(y_pred[..., :1])
+ y_pred_neg = torch.cat((y_pred_neg, zeros), dim=-1)
+ y_pred_pos = torch.cat((y_pred_pos, zeros), dim=-1)
+ neg_loss = torch.logsumexp(y_pred_neg, dim=-1)
+ pos_loss = torch.logsumexp(y_pred_pos, dim=-1)
+ if self.reduction == 'mean':
+ return torch.mean(neg_loss + pos_loss)
+ elif self.reduction == 'sum':
+ return torch.sum(neg_loss + pos_loss)
+ else:
+ return neg_loss + pos_loss
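+ # The formulation above is the softmax-style multi-label cross entropy:
+ # loss = log(1 + sum_{i in negatives} e^{s_i}) + log(1 + sum_{j in positives} e^{-s_j});
+ # the appended zero column supplies the "+1" inside each logsumexp.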
+
+
+ class MaskedAsymmetricLoss(_MaskedLoss):
+ """Masked AsymmetricLoss"""
+ def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=False, reduction='mean', ignore_nans=True, ignore_value=-100):
+ super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+ self.criterion = AsymmetricLoss(gamma_neg, gamma_pos, clip, eps, disable_torch_grad_focal_loss)
+
+
+ class AsymmetricLoss(nn.Module):
+ def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
+ super(AsymmetricLoss, self).__init__()
+
+ self.gamma_neg = gamma_neg
+ self.gamma_pos = gamma_pos
+ self.clip = clip
+ self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
+ self.eps = eps
+
+ def forward(self, x, y):
+ """
+ Parameters
+ ----------
+ x: input logits
+ y: targets (multi-label binarized vector)
+ """
+
+ # Calculating Probabilities
+ x_sigmoid = torch.sigmoid(x)
+ xs_pos = x_sigmoid
+ xs_neg = 1 - x_sigmoid
+
+ # Asymmetric Clipping
+ if self.clip is not None and self.clip > 0:
+ xs_neg = (xs_neg + self.clip).clamp(max=1)
+
+ # Basic CE calculation
+ los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
+ los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
+ loss = los_pos + los_neg
+
+ # Asymmetric Focusing
+ if self.gamma_neg > 0 or self.gamma_pos > 0:
+ if self.disable_torch_grad_focal_loss:
+ torch.set_grad_enabled(False)
+ pt0 = xs_pos * y
+ pt1 = xs_neg * (1 - y) # pt = p if t > 0 else 1-p
+ pt = pt0 + pt1
+ one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
+ one_sided_w = torch.pow(1 - pt, one_sided_gamma)
+ if self.disable_torch_grad_focal_loss:
+ torch.set_grad_enabled(True)
+ loss *= one_sided_w
+
+ return -loss.sum()
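+ # Asymmetric loss applies separate focusing exponents to positives (gamma_pos)
+ # and negatives (gamma_neg) and shifts negative probabilities by `clip` before
+ # the log; note that this variant returns the summed (not mean) loss.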
+
+
+ class MaskedAsymmetricLossOptimized(_MaskedLoss):
+ """Masked AsymmetricLossOptimized"""
+ def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=False, reduction='mean', ignore_nans=True, ignore_value=-100):
+ super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+ self.criterion = AsymmetricLossOptimized(gamma_neg, gamma_pos, clip, eps, disable_torch_grad_focal_loss)
+
+
+ class AsymmetricLossOptimized(nn.Module):
+ '''
+ Notice - optimized version, minimizes memory allocation and gpu uploading,
+ favors inplace operations
+ '''
+
+ def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=False):
+ super(AsymmetricLossOptimized, self).__init__()
+
+ self.gamma_neg = gamma_neg
+ self.gamma_pos = gamma_pos
+ self.clip = clip
+ self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
+ self.eps = eps
+
+ # prevent memory allocation and gpu uploading every iteration, and encourage inplace operations
+ self.targets = self.anti_targets = self.xs_pos = self.xs_neg = self.asymmetric_w = self.loss = None
+
+ def forward(self, x, y):
+ """
+ Parameters
+ ----------
+ x: input logits
+ y: targets (multi-label binarized vector)
+ """
+
+ self.targets = y
+ self.anti_targets = 1 - y
+
+ # Calculating Probabilities
+ self.xs_pos = torch.sigmoid(x)
+ self.xs_neg = 1.0 - self.xs_pos
+
+ # Asymmetric Clipping
+ if self.clip is not None and self.clip > 0:
+ self.xs_neg.add_(self.clip).clamp_(max=1)
+
+ # Basic CE calculation
+ self.loss = self.targets * torch.log(self.xs_pos.clamp(min=self.eps))
+ self.loss.add_(self.anti_targets * torch.log(self.xs_neg.clamp(min=self.eps)))
+
+ # Asymmetric Focusing
+ if self.gamma_neg > 0 or self.gamma_pos > 0:
+ if self.disable_torch_grad_focal_loss:
+ torch.set_grad_enabled(False)
+ self.xs_pos = self.xs_pos * self.targets
+ self.xs_neg = self.xs_neg * self.anti_targets
+ self.asymmetric_w = torch.pow(1 - self.xs_pos - self.xs_neg,
+ self.gamma_pos * self.targets + self.gamma_neg * self.anti_targets)
+ if self.disable_torch_grad_focal_loss:
+ torch.set_grad_enabled(True)
+ self.loss *= self.asymmetric_w
+
+ return -self.loss.sum()
+
+
+ class MaskedASLSingleLabel(_MaskedLoss):
+ """Masked ASLSingleLabel loss"""
+ def __init__(self, gamma_pos=0, gamma_neg=4, eps: float = 0.1, reduction='mean', ignore_nans=True, ignore_value=-100):
+ super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+ self.criterion = ASLSingleLabel(gamma_pos, gamma_neg, eps, reduction='none')
+
+
+ class ASLSingleLabel(nn.Module):
+ '''
+ This loss is intended for single-label classification problems (multi-class)
+ '''
+ def __init__(self, gamma_pos=0, gamma_neg=4, eps: float = 0.1, reduction='mean'):
+ super(ASLSingleLabel, self).__init__()
+
+ self.eps = eps
+ self.logsoftmax = nn.LogSoftmax(dim=-1)
+ self.targets_classes = []
+ self.gamma_pos = gamma_pos
+ self.gamma_neg = gamma_neg
+ self.reduction = reduction
+
+ def forward(self, inputs, target):
+ '''
+ "inputs" dimensions: (batch_size, number_classes)
+ "target" dimensions: (batch_size)
+ '''
+ num_classes = inputs.size()[-1]
+ log_preds = self.logsoftmax(inputs)
+ self.targets_classes = torch.zeros_like(inputs).scatter_(1, target.long().unsqueeze(1), 1)
+
+ # ASL weights
+ targets = self.targets_classes
+ anti_targets = 1 - targets
+ xs_pos = torch.exp(log_preds)
+ xs_neg = 1 - xs_pos
+ xs_pos = xs_pos * targets
+ xs_neg = xs_neg * anti_targets
+ asymmetric_w = torch.pow(1 - xs_pos - xs_neg, self.gamma_pos * targets + self.gamma_neg * anti_targets)
+ log_preds = log_preds * asymmetric_w
+
+ if self.eps > 0:
+ # label smoothing
+ self.targets_classes = self.targets_classes.mul(1 - self.eps).add(self.eps / num_classes)
+
+ # loss calculation
+ loss = - self.targets_classes.mul(log_preds)
+
+ loss = loss.sum(dim=-1)
+ if self.reduction == 'mean':
+ loss = loss.mean()
+
+ return loss
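+ # Illustrative only (tensors here are hypothetical):
+ # loss_fct = ASLSingleLabel(gamma_pos=0, gamma_neg=4, eps=0.1, reduction="mean")
+ # logits = torch.randn(8, 5); target = torch.randint(0, 5, (8,))
+ # loss = loss_fct(logits, target) # eps > 0 additionally applies label smoothing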
+
+
+ class MaskedBCEWithLogitsLoss(_MaskedLoss):
+ """Masked BCEWithLogits loss"""
+ def __init__(self, pos_weight=None, weight=None, reduction='mean', ignore_nans=True, ignore_value=-100):
+ super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+ self.criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight, weight=weight, reduction='none')
+
+
+ class MaskedCrossEntropyLoss(_MaskedLoss):
+ """Masked CrossEntropy loss"""
+ def __init__(self, weight=None, reduction='mean', ignore_nans=True, ignore_value=-100):
+ super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+ self.criterion = nn.CrossEntropyLoss(weight=weight, reduction='none', ignore_index=ignore_value)
config.json ADDED
@@ -0,0 +1,71 @@
+ {
+ "alphabet": "gene_prot",
+ "architectures": [
+ "LucaGPLM"
+ ],
+ "attention_probs_dropout_prob": 0.0,
+ "auto_map": {
+ "AutoConfig": "lucaone_gplm_config.LucaGPLMConfig",
+ "AutoModel": "lucaone_gplm.LucaGPLM"
+ },
+ "bos_token_id": 2,
+ "classifier_dropout": 0.0,
+ "classifier_dropout_prob": 0.0,
+ "classifier_hidden_act": "gelu",
+ "embed_scale": 1.0,
+ "eos_token_id": 3,
+ "gene_mask_classifier_output_size": 2048,
+ "gene_mask_label_num": 39,
+ "gene_taxonomy_classifier_output_size": 2048,
+ "gene_taxonomy_label_num": 735,
+ "gene_type_classifier_output_size": 128,
+ "gene_type_label_num": 8,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": 2560,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2"
+ },
+ "ignore_index": -100,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2
+ },
+ "mask_token_id": 4,
+ "max_position_embeddings": 1280,
+ "model_type": "lucagplm",
+ "no_position_embeddings": true,
+ "no_token_type_embeddings": false,
+ "num_attention_heads": 40,
+ "num_hidden_layers": 20,
+ "pad_token_id": 0,
+ "prot_contact_classifier_output_size": 3072,
+ "prot_domain_classifier_output_size": 10240,
+ "prot_domain_label_num": 13717,
+ "prot_homo_classifier_output_size": 4096,
+ "prot_homo_label_num": 3443,
+ "prot_keyword_classifier_output_size": 2048,
+ "prot_keyword_label_num": 1179,
+ "prot_mask_classifier_output_size": 2048,
+ "prot_mask_label_num": 39,
+ "prot_secondary_classifier_output_size": 3072,
+ "prot_site_classifier_output_size": 1024,
+ "prot_site_label_num": 946,
+ "prot_structure_classifier_output_size": 128,
+ "prot_structure_label_num": 3,
+ "prot_taxonomy_classifier_output_size": 2048,
+ "prot_taxonomy_label_num": 2196,
+ "sep_token_id": 3,
+ "token_dropout": false,
+ "torch_dtype": "float32",
+ "trans_classifier_output_size": 128,
+ "transformers_version": "4.29.0",
+ "type_vocab_size": 2,
+ "unk_token_id": 1,
+ "use_embed_layer_norm": false,
+ "use_last_layer_norm": true,
+ "vocab_size": 39
+ }
file_operator.py ADDED
@@ -0,0 +1,230 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ import csv, sys
+ import io, textwrap, itertools
+ from Bio import SeqIO
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+ csv.field_size_limit(sys.maxsize)
+
+
+ common_nucleotide_set = {'A', 'T', 'C', 'G', 'U', 'N'}
+
+ # not {'O', 'U', 'Z', 'J', 'B'}
+ # Common amino acids
+ common_amino_acid_set = {'R', 'X', 'S', 'G', 'W', 'I', 'Q', 'A', 'T', 'V', 'K', 'Y', 'C', 'N', 'L', 'F', 'D', 'M', 'P', 'H', 'E'}
+
+
+ def clean_seq(protein_id, seq):
+ seq = seq.upper()
+ new_seq = ""
+ has_invalid_char = False
+ invalid_char_set = set()
+ for ch in seq:
+ if 'A' <= ch <= 'Z' and ch not in ['J']:
+ new_seq += ch
+ else:
+ invalid_char_set.add(ch)
+ has_invalid_char = True
+ if has_invalid_char:
+ print("id: %s. Seq: %s" % (protein_id, seq))
+ print("invalid char set:", invalid_char_set)
+ return new_seq
+
+
+ def file_reader(filename, header=True, header_filter=True):
+ if filename.endswith(".fa") or filename.endswith(".fas") or filename.endswith(".fasta"):
+ return fasta_reader(filename)
+ elif filename.endswith(".csv"):
+ return csv_reader(filename, header=True, header_filter=True)
+ elif filename.endswith(".tsv"):
+ return tsv_reader(filename, header=True, header_filter=True)
+ else:
+ return txt_reader(filename, header=header, header_filter=header_filter)
+
+
+ def txt_reader(handle, header=True, header_filter=True):
+ '''
+ text reader, suitable for large files
+ :param handle: file path or handle
+ :param header: whether the file has a header row
+ :param header_filter: whether to drop the header row from the results
+ :return:
+ '''
+ handle = handle if isinstance(handle, io.TextIOWrapper) else open(handle, 'r')
+ try:
+ cnt = 0
+ for line in handle:
+ cnt += 1
+ if header and header_filter and cnt == 1:
+ continue
+ yield line.strip()
+ except Exception as e:
+ # re-raise the original error; raising StopIteration inside a
+ # generator becomes a RuntimeError under PEP 479
+ raise e
+ finally:
+ if not handle.closed:
+ handle.close()
+
+ def tsv_reader(handle, header=True, header_filter=True):
+ '''
+ tsv reader, suitable for large files
+ :param handle: file path or handle
+ :param header: whether the file has a header row
+ :param header_filter: whether to drop the header row from the results
+ :return:
+ '''
+ handle = handle if isinstance(handle, io.TextIOWrapper) else open(handle, 'r')
+ try:
+ reader = csv.reader(handle, delimiter="\t")
+ cnt = 0
+ for row in reader:
+ cnt += 1
+ if header and header_filter and cnt == 1:
+ continue
+ yield row
+ except Exception as e:
+ raise e
+ finally:
+ if not handle.closed:
+ handle.close()
+
+
+ def csv_reader(handle, header=True, header_filter=True):
+ '''
+ csv reader, suitable for large files
+ :param handle: file path or handle
+ :param header: whether the file has a header row
+ :param header_filter: whether to drop the header row from the results
+ :return:
+ '''
+ handle = handle if isinstance(handle, io.TextIOWrapper) else open(handle, 'r')
+ try:
+ # data = csv.reader((line.replace('\0','') for line in data_initial), delimiter=",")
+ # reader = csv.reader(handle)
+ reader = csv.reader((line.replace('\0', '') for line in handle))
+ cnt = 0
+ for row in reader:
+ cnt += 1
+ if header and header_filter and cnt == 1:
+ continue
+ yield row
+ except Exception as e:
+ raise e
+ finally:
+ if not handle.closed:
+ handle.close()
+
+
+ def txt_writer(dataset, handle, header=None):
+ '''
+ txt writer
+ :param dataset: data rows
+ :param handle: output file path
+ :param header: header
+ :return:
+ '''
+ '''
+ handle = handle if isinstance(handle, io.TextIOWrapper) else open(handle, 'w')
+ try:
+ if header:
+ if isinstance(header, list):
+ handle.write(",".join(header) + "\n")
+ else:
+ handle.write(header + "\n")
+ print("header: %s" % header)
+ for row in dataset:
+ handle.write(str(row) + "\n")
+ except Exception as e:
+ raise e
+ finally:
+ if not handle.closed:
+ handle.close()
+ '''
+ with open(handle, "w") as wfp:
+ if header:
+ if isinstance(header, list):
+ wfp.write(",".join(header) + "\n")
+ else:
+ wfp.write(header + "\n")
+ for row in dataset:
+ wfp.write(str(row) + "\n")
+
+
+ def csv_writer(dataset, handle, header):
+ '''
+ csv writer, suitable for large files
+ :param dataset: data rows
+ :param handle: output file path or handle
+ :param header: header
+ :return:
+ '''
+ handle = handle if isinstance(handle, io.TextIOWrapper) else open(handle, 'w')
+ try:
+ writer = csv.writer(handle)
+ if header:
+ writer.writerow(header)
+ for row in dataset:
+ writer.writerow(row)
+ except Exception as e:
+ raise e
+ finally:
+ if not handle.closed:
+ handle.close()
+
+
+ def fasta_reader(handle, width=None):
+ """
+ Reads a FASTA file, yielding (header, sequence) pairs for each sequence recovered; suitable for large files
+ args:
+ :handle (str, pathlib.Path, or file pointer) - fasta to read from
+ :width (int or None) - formats the sequence to have at most `width` characters per line.
+ If <= 0, processed as None. If None, there is no max width.
+ yields:
+ :(header, sequence) tuples
+ returns:
+ :None
+ """
+ FASTA_STOP_CODON = "*"
+
+ handle = handle if isinstance(handle, io.TextIOWrapper) else open(handle, 'r')
+ width = width if isinstance(width, int) and width > 0 else None
+ try:
+ header = None
+ for is_header, group in itertools.groupby(handle, lambda line: line.startswith(">")):
+ if is_header:
+ header = group.__next__().strip()
+ else:
+ seq = ''.join(line.strip() for line in group).strip().rstrip(FASTA_STOP_CODON)
+ if width is not None:
+ seq = textwrap.fill(seq, width)
+ yield header, seq
+ except Exception as e:
+ raise e
+ finally:
+ if not handle.closed:
+ handle.close()
+
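+ # Illustrative only (the path is hypothetical):
+ # for header, seq in fasta_reader("example.fasta"):
+ # print(header, len(seq))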
+
+
+ def write_fasta(filepath, sequences):
+ '''
+ write a fasta file
+ :param filepath: save path
+ :param sequences: fasta sequences (each item: [id, seq])
+ :return:
+ '''
+
+ if sequences:
+ with open(filepath, "w") as output_handle:
+ if len(sequences[0]) > 1 and isinstance(sequences[0][0], str):
+ for row in sequences:
+ protein_id = row[0]
+ seq = row[1]
+ sequence = SeqRecord(Seq(seq), id=protein_id[1:] if protein_id and protein_id[0] == ">" else protein_id, description="")
+ SeqIO.write(sequence, output_handle, "fasta")
+ else:
+ for sequence in sequences:
+ SeqIO.write(sequence, output_handle, "fasta")
+
+
loss.py ADDED
@@ -0,0 +1,224 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+ '''
+ @license: (C) Copyright 2021, Hey.
+ @author: Hey
+ @email: [email protected]
+ @tel: 137****6540
+ @datetime: 2023/5/3 20:35
+ @project: LucaOne
+ @file: loss.py
+ @desc: loss
+ '''
+ import torch, math
+ import torch.nn as nn
+
+ from .classification_loss import *
+ from .regression_loss import *
+
+
+
+ class NewGELUActivation(nn.Module):
+ """
+ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+ the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+ """
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
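+ # This is the tanh approximation of GELU:
+ # GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))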
+
+
+ def create_activate(activate_func):
+ # returns None when activate_func is falsy (callers check for None)
+ if activate_func:
+ activate_func = activate_func.lower()
+ if activate_func == "tanh":
+ return nn.Tanh()
+ elif activate_func == "relu":
+ return nn.ReLU()
+ elif activate_func == "leakyrelu":
+ return nn.LeakyReLU()
+ elif activate_func == "gelu":
+ return nn.GELU()
+ elif activate_func == "gelu_new":
+ return NewGELUActivation()
+ else:
+ return nn.Tanh()
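+ # Illustrative: create_activate("gelu_new") returns NewGELUActivation(); an
+ # unrecognized name falls back to nn.Tanh(), and a falsy name returns None.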
+
+
+ def create_loss_function(config,
+ args,
+ task_level_type,
+ task_level_name,
+ sigmoid,
+ output_mode,
+ num_labels,
+ loss_type,
+ ignore_index=-100,
+ pair_level=False,
+ return_types=["dropout", "hidden_layer", "hidden_act", "classifier", "output", "loss"]
+ ):
+ '''
+ create the output layer and loss layer
+ :param task_level_name:
+ :param task_level_type:
+ :param pair_level:
+ :param config:
+ :param args:
+ :param sigmoid:
+ :param output_mode:
+ :param num_labels:
+ :param loss_type:
+ :param ignore_index:
+ :param return_types:
+ :return:
+ '''
+ dropout, hidden_layer, hidden_act, classifier, output, loss_fct = None, None, None, None, None, None
+ if "dropout" in return_types:
+ if hasattr(config, "classifier_dropout_prob"):
+ dropout = nn.Dropout(config.classifier_dropout_prob)
+ elif hasattr(config, "dropout_prob"):
+ dropout = nn.Dropout(config.dropout_prob)
+ else:
+ dropout = nn.Dropout(0.1)
+
+ if pair_level:
+ hidden_size = 2 * config.hidden_size
+ else:
+ hidden_size = config.hidden_size
+ if "hidden_layer" in return_types:
+ if isinstance(args.classifier_size, int):
+ hidden_layer_size = args.classifier_size
+ else:
+ hidden_layer_size = args.classifier_size[task_level_type][task_level_name]
+ hidden_layer = nn.Linear(hidden_size, hidden_layer_size, bias=True)
+ hidden_size = hidden_layer_size
+
+ if "hidden_act" in return_types:
+ if hasattr(args, "classifier_hidden_act"):
+ hidden_act = create_activate(args.classifier_hidden_act)
+ elif hasattr(config, "classifier_hidden_act"):
+ hidden_act = create_activate(config.classifier_hidden_act)
+
+ if "classifier" in return_types:
+ if sigmoid:
+ if output_mode in ["binary_class", "binary-class"]:
+ classifier = nn.Linear(hidden_size, 1, bias=True)
+ else:
+ classifier = nn.Linear(hidden_size, num_labels, bias=True)
+ else:
+ classifier = nn.Linear(hidden_size, num_labels, bias=True)
+ if "output" in return_types:
+ if sigmoid or output_mode in ["multi_label", "multi-label", "binary_class", "binary-class"]:
+ output = nn.Sigmoid()
+ elif output_mode in ["multi_class", "multi-class"]:
+ output = nn.Softmax(dim=-1)
+ else:
+ output = None
+
+ if "loss" in return_types:
+ # positive weight
+ if hasattr(args, "pos_weight") and args.pos_weight:
+ pos_weight = args.pos_weight
+ elif hasattr(config, "pos_weight") and config.pos_weight:
+ pos_weight = config.pos_weight
+ else:
+ pos_weight = None
+
+ if hasattr(args, "weight") and args.weight is not None:
+ weight = args.weight
+ elif hasattr(config, "weight") and config.weight is not None:
+ weight = config.weight
+ else:
+ weight = None
+
+ reduction = config.loss_reduction if hasattr(config, "loss_reduction") else "meanmean"
+ if output_mode in ["regression"]:
+ if loss_type == "l2":
+ loss_fct = MaskedMSELoss(reduction=reduction, ignore_nans=True,
+ ignore_value=ignore_index * 1.0 if ignore_index else None)
+ elif loss_type == "l1":
+ loss_fct = MaskedL1Loss(reduction=reduction, ignore_nans=True,
+ ignore_value=ignore_index * 1.0 if ignore_index else None)
+ elif output_mode in ["multi_label", "multi-label"]:
+ if loss_type == "bce":
+ if pos_weight:
+ if isinstance(pos_weight, str) or isinstance(pos_weight, int):
+ pos_weight = [float(pos_weight)] * num_labels
+ elif isinstance(pos_weight, float):
+ pos_weight = [pos_weight] * num_labels
+ pos_weight = torch.tensor(pos_weight, dtype=torch.float32).to(args.device)
+ print("multi_label pos_weight:")
+ print(pos_weight)
+ assert pos_weight.ndim == 1 and pos_weight.shape[0] == num_labels
+ print("multi_label reduction:")
+ print(reduction)
+ loss_fct = MaskedBCEWithLogitsLoss(pos_weight=pos_weight, reduction=reduction,
+ ignore_nans=True, ignore_value=ignore_index)
+ else:
+ loss_fct = MaskedBCEWithLogitsLoss(reduction=reduction,
+ ignore_nans=True, ignore_value=ignore_index)
+ elif loss_type == "asl":
+ loss_fct = MaskedAsymmetricLossOptimized(gamma_neg=args.asl_gamma_neg if hasattr(args, "asl_gamma_neg") else 4.0,
+ gamma_pos=args.asl_gamma_pos if hasattr(args, "asl_gamma_pos") else 1.0,
+ clip=args.clip if hasattr(args, "clip") else 0.05,
+ eps=args.eps if hasattr(args, "eps") else 1e-8,
+ disable_torch_grad_focal_loss=args.disable_torch_grad_focal_loss if hasattr(args, "disable_torch_grad_focal_loss") else False,
+ reduction=reduction,
+ ignore_nans=True,
+ ignore_value=ignore_index)
+ elif loss_type == "focal_loss":
+ loss_fct = MaskedFocalLoss(alpha=args.focal_loss_alpha if hasattr(args, "focal_loss_alpha") else 0.7,
+ gamma=args.focal_loss_gamma if hasattr(args, "focal_loss_gamma") else 2.0,
+ normalization=True,
+ reduction=reduction,
+ ignore_nans=True,
+ ignore_value=ignore_index)
+ elif loss_type == "multilabel_cce":
+ loss_fct = MaskedMultiLabelCCE(normalization=True,
+ reduction=reduction,
+ ignore_nans=True,
+ ignore_value=ignore_index)
+ elif output_mode in ["binary_class", "binary-class"]:
+ if loss_type == "bce":
+ if pos_weight:
+ if isinstance(pos_weight, int) or isinstance(pos_weight, str):
+ pos_weight = torch.tensor([float(pos_weight)], dtype=torch.float32).to(args.device)
+ elif isinstance(pos_weight, float):
+ pos_weight = torch.tensor([pos_weight], dtype=torch.float32).to(args.device)
+ print("binary_class pos_weight:")
+ print(pos_weight)
+ assert pos_weight.ndim == 1 and pos_weight.shape[0] == 1
+ loss_fct = MaskedBCEWithLogitsLoss(pos_weight=pos_weight, reduction=reduction, ignore_nans=True,
+ ignore_value=ignore_index)
+ else:
+ loss_fct = MaskedBCEWithLogitsLoss(reduction=reduction, ignore_nans=True, ignore_value=ignore_index)
+ elif loss_type == "focal_loss":
+ loss_fct = MaskedFocalLoss(alpha=args.focal_loss_alpha if hasattr(args, "focal_loss_alpha") else 0.7,
+ gamma=args.focal_loss_gamma if hasattr(args, "focal_loss_gamma") else 2.0,
+ normalization=True,
+ reduction=reduction,
+ ignore_nans=True,
+ ignore_value=ignore_index)
+ elif output_mode in ["multi_class", "multi-class"]:
+ if weight:
+ # [1, 1, 1, 1, 1, ...] length: num_labels
+ if isinstance(weight, str) or isinstance(weight, int):
+ weight = [float(weight)] * num_labels
+ if isinstance(weight, float):
+ weight = [weight] * num_labels
+ weight = torch.tensor(weight, dtype=torch.float32).to(args.device)
+ print("multi_class weight:")
+ print(weight)
+ assert weight.ndim == 1 and weight.shape[0] == num_labels
+ if ignore_index is None:
+ loss_fct = nn.CrossEntropyLoss(weight=weight, reduction=reduction)
+ else:
+ loss_fct = MaskedCrossEntropyLoss(weight=weight, reduction=reduction, ignore_nans=True, ignore_value=ignore_index)
+ else:
+ if ignore_index is None:
+ loss_fct = nn.CrossEntropyLoss(reduction=reduction)
+ else:
+ loss_fct = MaskedCrossEntropyLoss(reduction=reduction, ignore_nans=True, ignore_value=ignore_index)
+ else:
+ raise Exception("Unsupported output mode: %s." % output_mode)
+
+ return dropout, hidden_layer, hidden_act, classifier, output, loss_fct
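+ # Illustrative call (config/args stand in for the real training objects):
+ # dropout, hidden_layer, hidden_act, classifier, output, loss_fct = create_loss_function(
+ # config, args, task_level_type="seq_level", task_level_name="prot_taxonomy",
+ # sigmoid=False, output_mode="multi_class", num_labels=2196,
+ # loss_type="cce", ignore_index=-100)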
lucaone_gplm.py ADDED
@@ -0,0 +1,572 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ from .loss import *
+ from .model_utils import AllOutput, create_output_loss_lucagplm
+ from .alphabet import Alphabet
+ from .modeling_gplm import *
+ from .lucaone_gplm_config import LucaGPLMConfig
+ from transformers import PreTrainedModel
+
+ class LucaGPLM(PreTrainedModel):
+ config_class = LucaGPLMConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+ self.max_position_embeddings = config.max_position_embeddings
+ self.type_vocab_size = config.type_vocab_size
+ self.num_layers = config.num_hidden_layers
+ self.embed_dim = config.hidden_size
+ self.attention_heads = config.num_attention_heads
+ self.no_position_embeddings = config.no_position_embeddings
+ self.no_token_type_embeddings = config.no_token_type_embeddings
+ if not isinstance(config.alphabet, Alphabet):
+ self.alphabet = Alphabet.from_predefined(config.alphabet)
+ else:
+ self.alphabet = config.alphabet
+ self.alphabet_size = len(self.alphabet)
+ self.padding_idx = self.alphabet.padding_idx
+ self.mask_idx = self.alphabet.mask_idx
+ self.cls_idx = self.alphabet.cls_idx
+ self.eos_idx = self.alphabet.eos_idx
+ self.prepend_bos = self.alphabet.prepend_bos
+ self.append_eos = self.alphabet.append_eos
+ self.token_dropout = config.token_dropout
+ self.ignore_index = config.ignore_index
+ self.use_embed_layer_norm = config.use_embed_layer_norm
+ self.use_last_layer_norm = config.use_last_layer_norm
+ self.embed_scale = config.embed_scale
+ self._init_submodules()
+
+ def _init_submodules(self):
+ # normal_(0, 1)
+ self.embed_tokens = nn.Embedding(
+ self.alphabet_size,
+ self.embed_dim,
+ padding_idx=self.padding_idx,
+ )
+ self.embed_pos = None
+ if not self.no_position_embeddings:
+ self.embed_pos = nn.Embedding(self.max_position_embeddings, self.embed_dim)
+ self.embed_type = None
+ if not self.no_token_type_embeddings:
+ self.embed_type = nn.Embedding(self.type_vocab_size, self.embed_dim)
+ if self.use_embed_layer_norm:
+ self.embed_layer_norm = LucaGPLM1bLayerNorm(self.embed_dim)
+ else:
+ self.embed_layer_norm = None
+
+ self.layers = nn.ModuleList(
+ [
+ LucaGPLMTransformerLayer(
+ self.embed_dim,
+ 4 * self.embed_dim,
+ self.attention_heads,
+ add_bias_kv=False,
+ use_lucagplm1b_layer_norm=True,
+ use_rotary_embeddings=True,
+ )
+ for _ in range(self.num_layers)
+ ]
+ )
+ self.layer_size = len(self.layers)
+
+ self.contact_head = ContactPredictionHead(
+ self.num_layers * self.attention_heads,
+ self.prepend_bos,
+ self.append_eos,
+ eos_idx=self.eos_idx,
+ )
+ if self.use_last_layer_norm:
+ self.last_layer_norm = LucaGPLM1bLayerNorm(self.embed_dim)
+ else:
+ self.last_layer_norm = None
+
+ self.lm_head = RobertaLMHead(
+ embed_dim=self.embed_dim,
+ output_dim=self.alphabet_size,
+ weight=self.embed_tokens.weight,
+ )
+
+ def _init_embedding(self, pretrained_token_matrix, token_matrix):
+ '''
+ 0->2
+ 1->0
+ 2->3
+ 3->1
+ 4->10
+ ...
+ 28->34
+ 29->36
+ 30->37
+ 31->38
+ 32->4
+ '''
+ print("Loading existing pretrained embedding vectors:")
+ token_matrix[2, :] = pretrained_token_matrix[0, :]
+ token_matrix[0, :] = pretrained_token_matrix[1, :]
+ token_matrix[3, :] = pretrained_token_matrix[2, :]
+ token_matrix[1, :] = pretrained_token_matrix[3, :]
+ for idx in range(10, 35):
+ token_matrix[idx, :] = pretrained_token_matrix[idx - 6, :]
+ token_matrix[36, :] = pretrained_token_matrix[29, :]
+ token_matrix[37, :] = pretrained_token_matrix[30, :]
+ token_matrix[38, :] = pretrained_token_matrix[31, :]
+ token_matrix[4, :] = pretrained_token_matrix[32, :]
+ return token_matrix
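+ # The remapping above aligns an ESM-style vocabulary order
+ # ([CLS], [PAD], [SEP], [UNK], amino acids, ..., [MASK]) with this model's
+ # alphabet ([PAD]=0, [UNK]=1, [CLS]=2, [SEP]=3, [MASK]=4, standard tokens
+ # from index 10), copying each pretrained embedding row into its new index.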
+
+ def _init_submodules_new(self, pretrained_model_name):
+ print("Loading existing weights from the pretrained model:")
+ from esm import pretrained
+ from collections import OrderedDict
+ pretrained, _ = pretrained.load_model_and_alphabet(pretrained_model_name)
+ pretrained_state_dict = pretrained.state_dict()
+ new_state_dict = OrderedDict()
+ our_model_state_dict = {}
+ for key, value in self.state_dict().items():
+ our_model_state_dict[key] = value
+ for name, weight in pretrained_state_dict.items():
+ if "final_layer_norm" in name:
+ name = name.replace("final_layer_norm", "post_layer_norm")
+ elif "self_attn_layer_norm" in name:
+ name = name.replace("self_attn_layer_norm", "pre_layer_norm")
+ elif "emb_layer_norm_after" in name:
+ name = name.replace("emb_layer_norm_after", "last_layer_norm")
+ if name.startswith("layers."):
+ layer_id = name.split(".")[1]
+ if int(layer_id) >= self.num_layers:
+ continue
+ if name == "embed_tokens.weight":
+ new_state_dict[name] = self._init_embedding(weight, our_model_state_dict[name])
+ del our_model_state_dict[name]
+ elif name in our_model_state_dict and our_model_state_dict[name].shape == weight.shape:
+ del our_model_state_dict[name]
+ new_state_dict[name] = weight
+
+ print("Layer names loaded from the pretrained model:")
+ print(new_state_dict.keys())
+ print("Layer names not found in the pretrained model:")
+ print(our_model_state_dict.keys())
+ new_state_dict.update(our_model_state_dict)
+ self.load_state_dict(new_state_dict)
+
+ def __calc_loss__(self, task_level_type, output_mode, logits, label, label_size, loss_fct, loss_reduction):
+ '''
+ if label_size <= 2 or output_mode in ["binary_class", "binary-class"]:
+ loss = loss_fct(logits.view(-1), label.view(-1).float())
+ elif output_mode in ["multi_label", "multi-label"]:
+ loss = loss_fct(logits.view(-1, label_size), label.view(-1, label_size).float())
+ elif output_mode in ["multi_class", "multi-class"]:
+ loss = loss_fct(logits.view(-1, label_size), label.view(-1))
+ else:
+ loss = loss_fct(logits.view(-1), label.view(-1))
+ return loss
+ '''
+ '''
+ print(task_level_type, output_mode, label_size, loss_fct, loss_reduction)
+ print("logits:")
+ print(logits.shape)
+ print("label:")
+ print(label.shape)
+ '''
+ if output_mode in ["regression"]:
+ if task_level_type not in ["seq_level"] and loss_reduction == "meanmean":
+ # structure-level regression
+ # logits: N, seq_len, 3
+ # label: N, seq_len, 3
+ loss = loss_fct(logits, label)
+ else:
+ # structure-level regression
+ # logits: N * seq_len * 3
+ # label: N * seq_len * 3
+ loss = loss_fct(logits.view(-1), label.view(-1))
+ elif output_mode in ["multi_label", "multi-label"]:
+ # only for seq-level
+ if loss_reduction == "meanmean":
+ # logits: N, label_size
+ # label: N, label_size
+ loss = loss_fct(logits, label.float())
+ else:
+ # logits: N, label_size
+ # label: N, label_size
+ loss = loss_fct(logits.view(-1, label_size), label.view(-1, label_size).float())
+ elif label_size <= 2 or output_mode in ["binary_class", "binary-class"]:
+ if task_level_type not in ["seq_level"] and loss_reduction == "meanmean":
+ # token-level & meanmean
+ # logits: N, seq_len, 1
+ # label: N, seq_len
+ loss = loss_fct(logits, label.float())
+ else:
+ # seq-level || token-level
+ # logits: N
+ # label: N
+ loss = loss_fct(logits.view(-1), label.view(-1).float())
+ elif output_mode in ["multi_class", "multi-class"]:
+ if task_level_type not in ["seq_level"] and loss_reduction == "meanmean":
+ # token-level
+ # logits: N, seq_len, label_size
+ # label: N, seq_len
+ loss = loss_fct(logits, label)
+ else:
+ # token-level
+ # logits: N * seq_len, label_size
+ # label: N * seq_len
+ # seq-level
+ # logits: N, label_size
+ # label: N
+ loss = loss_fct(logits.view(-1, label_size), label.view(-1))
+ else:
+ raise Exception("Unsupported output_mode=%s" % output_mode)
+ return loss
+
+ def __forword__(self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_keys: Optional[dict[str, set[str]]] = None,
+ labels: Optional[dict[str, dict[str, torch.Tensor]]] = None,
+ repr_layers=[-1],
+ need_head_weights=False,
+ return_contacts=False,
+ use_last_layer_norm=True):
+ assert all(-(self.layer_size + 1) <= i <= self.layer_size for i in repr_layers)
+ repr_layers = [(i + self.layer_size + 1) % (self.layer_size + 1) for i in repr_layers]
+
+ if return_contacts:
+ need_head_weights = True
+
+ assert input_ids.ndim == 2
+ # build the padding mask dynamically: (B, seq_len), True at padded positions
+ if attention_mask is None:
+ padding_mask = input_ids.eq(self.padding_idx)
+ else:
+ padding_mask = attention_mask.eq(self.padding_idx)
+
+ x = self.embed_scale * self.embed_tokens(input_ids)
+ if self.embed_pos is not None and position_ids is not None:
+ x += self.embed_scale * self.embed_pos(position_ids)
+ if self.embed_type is not None and token_type_ids is not None:
+ x += self.embed_scale * self.embed_type(token_type_ids)
+ if self.embed_layer_norm is not None:
+ x = self.embed_layer_norm(x)
+ # Token dropout
+ if self.token_dropout:
+ x.masked_fill_((input_ids == self.mask_idx).unsqueeze(-1), 0.0)
+ # x: B x L x C
+ mask_ratio_train = 0.15 * 0.8
+ src_lengths = (~padding_mask).sum(-1)
+ mask_ratio_observed = (input_ids == self.mask_idx).sum(-1).to(x.dtype) / src_lengths
+ x = x * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]
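+ # ESM-style token-dropout rescaling: masked embeddings were zeroed above, so
+ # activations are rescaled by (1 - 0.15 * 0.8) / (1 - observed mask ratio) to
+ # keep their expected magnitude consistent between training and inference.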
+
+ # apply the padding mask
+ if padding_mask is not None:
+ x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))
+
+ # which layers to include in the return value
+ repr_layers = set(repr_layers)
+ hidden_representations = {}
+ # 0: the embedding layer
+ if 0 in repr_layers:
+ hidden_representations[0] = x
+
+ # whether attention head weights are needed
+ if need_head_weights:
+ attn_weights = []
+
+ # (B, L, E) => (L, B, E)
+ x = x.transpose(0, 1)
+
+ if not padding_mask.any():
+ padding_mask = None
+
+ for layer_idx, layer in enumerate(self.layers):
+ x, attn = layer(
+ x,
+ self_attn_padding_mask=padding_mask,
+ need_head_weights=need_head_weights,
+ )
+ if (layer_idx + 1) in repr_layers:
+ hidden_representations[layer_idx + 1] = x.transpose(0, 1)
+ if need_head_weights:
+ # (H, B, L, L) => (B, H, L, L)
+ attn_weights.append(attn.transpose(1, 0))
+
+ # (L, B, E)
+ if self.last_layer_norm is not None and use_last_layer_norm:
+ # apply an extra layer norm to the last hidden layer
+ x = self.last_layer_norm(x)
+ x = x.transpose(0, 1) # (L, B, E) => (B, L, E)
+
+ # last hidden representation should have layer norm applied
+ if (layer_idx + 1) in repr_layers:
+ hidden_representations[layer_idx + 1] = x
+ # the last layer serves as the representation matrix
+ # (B, L, E)
+ representation_matrix = hidden_representations[self.layer_size]
+ # masked LM task
+ # B * Seq_len * vocab_size
+ lm_mask_logits = self.lm_head(x)
+ # the [CLS] position of the representation matrix serves as the representation vector
+ # (B, E)
+ representation_vector = representation_matrix[:, 0, :]
+
+ logits = {}
+ losses = {}
+ outputs = {}
+ representations = {
+ "representation_matrix": representation_matrix,
+ "representation_vector": representation_vector
+ }
+ # attention weights of every layer
+ if need_head_weights:
+ # attentions: B x Layers x H x L x L
+ attentions = torch.stack(attn_weights, 1)
+ if padding_mask is not None:
+ attention_mask = 1 - padding_mask.type_as(attentions)
+ attention_mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
+ attentions = attentions * attention_mask[:, None, None, :, :]
+ representations["attentions"] = attentions
+ # predict the contact matrix
+ if return_contacts:
+ contacts = self.contact_head(input_ids, attentions)
+ representations["contacts"] = contacts
+ '''
+ print("output_keys:")
+ print(output_keys)
+ '''
+ if output_keys:
+ for item in output_keys.items():
+ cur_task_level_type = item[0]
+ if cur_task_level_type not in logits:
+ logits[cur_task_level_type] = {}
+ outputs[cur_task_level_type] = {}
+ for cur_task_level_name in item[1]:
+ if cur_task_level_type == "token_level":
+ cur_logits = lm_mask_logits
+ elif cur_task_level_type == "seq_level":
+ cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](representation_vector)
+ cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
+ if cur_hidden_layer is not None:
+ cur_logits = cur_hidden_layer(cur_logits)
+ cur_hidden_act = self.hidden_act[cur_task_level_type][cur_task_level_name]
+ if cur_hidden_act is not None:
+ cur_logits = cur_hidden_act(cur_logits)
+ cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
+ elif cur_task_level_type == "span_level":
+ cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](representation_matrix)
+ cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
+ if cur_hidden_layer is not None:
+ cur_logits = cur_hidden_layer(cur_logits)
+ cur_hidden_act = self.hidden_act[cur_task_level_type][cur_task_level_name]
+ if cur_hidden_act is not None:
+ cur_logits = cur_hidden_act(cur_logits)
+ cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
+ elif cur_task_level_type == "structure_level":
+ cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](representation_matrix)
+ cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
+ if cur_hidden_layer is not None:
+ cur_logits = cur_hidden_layer(cur_logits)
+ cur_hidden_act = self.hidden_act[cur_task_level_type][cur_task_level_name]
+ if cur_hidden_act is not None:
+ cur_logits = cur_hidden_act(cur_logits)
+ cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
+ logits[cur_task_level_type][cur_task_level_name] = cur_logits
+ if cur_task_level_type in self.output and cur_task_level_name in self.output[cur_task_level_type] \
+ and self.output[cur_task_level_type][cur_task_level_name] is not None:
+ outputs[cur_task_level_type][cur_task_level_name] = self.output[cur_task_level_type][cur_task_level_name](cur_logits)
+ else:
+ outputs[cur_task_level_type][cur_task_level_name] = cur_logits
+ if labels is not None and cur_task_level_type in labels and cur_task_level_name in labels[cur_task_level_type]:
+ if cur_task_level_type not in losses:
+ losses[cur_task_level_type] = {}
+ cur_label = labels[cur_task_level_type][cur_task_level_name]
+ cur_label_size = self.label_size[cur_task_level_type][cur_task_level_name]
+ cur_output_mode = self.output_mode[cur_task_level_type][cur_task_level_name]
+ cur_loss_fct = self.loss_fct[cur_task_level_type][cur_task_level_name]
+ cur_loss = self.__calc_loss__(
+ task_level_type=cur_task_level_type,
+ output_mode=cur_output_mode,
+ logits=cur_logits,
+ label=cur_label,
+ label_size=cur_label_size,
+ loss_fct=cur_loss_fct,
+ loss_reduction="meanmean")
+ losses[cur_task_level_type][cur_task_level_name] = cur_loss
+ return representations, logits, outputs, losses
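+ # Illustrative: __forword__ returns (representations, logits, outputs, losses);
+ # representations["representation_matrix"] holds the per-token embeddings (B, L, E)
+ # and representations["representation_vector"] the [CLS] vector (B, E).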
398
+
399
+ def forward(
400
+ self,
401
+ input_ids: Optional[torch.Tensor] = None,
402
+ attention_mask: Optional[torch.Tensor] = None,
403
+ global_attention_mask: Optional[torch.Tensor] = None,
404
+ token_type_ids: Optional[torch.Tensor] = None,
405
+ position_ids: Optional[torch.Tensor] = None,
406
+ head_mask: Optional[torch.Tensor] = None,
407
+ inputs_embeds: Optional[torch.Tensor] = None,
408
+ output_keys: Optional[dict[str, set[str]]] = None,
409
+ labels: Optional[dict[str, dict[str, torch.Tensor]]] = None,
410
+ input_ids_b: Optional[torch.Tensor] = None,
411
+ attention_mask_b: Optional[torch.Tensor] = None,
412
+ global_attention_mask_b: Optional[torch.Tensor] = None,
413
+ token_type_ids_b: Optional[torch.Tensor] = None,
414
+ position_ids_b: Optional[torch.Tensor] = None,
415
+ head_mask_b: Optional[torch.Tensor] = None,
416
+ inputs_embeds_b: Optional[torch.Tensor] = None,
417
+ output_keys_b: Optional[dict[str, set[str]]] = None,
418
+ labels_b: Optional[dict[str, dict[str, torch.Tensor]]] = None,
419
+ pair_label: Optional[dict[str, dict[str, torch.Tensor]]] = None,
420
+ pair_output_keys: Optional[dict[str, set[str]]] = None,
421
+ output_hidden_states: Optional[dict[str, set[str]]] = None,
422
+ output_attentions: Optional[dict[str, set[str]]] = None,
423
+ need_head_weights: Optional[bool] = None,
424
+ return_contacts: Optional[bool] = None,
425
+ repr_layers: Optional[list[int]] = None,
426
+ return_dict: Optional[bool] = None,
427
+ use_last_layer_norm: Optional[bool] = True
428
+ ) -> Union[Tuple[torch.Tensor], AllOutput]:
429
+ if return_dict is None and self.config is not None:
430
+ return_dict = self.config.use_return_dict
431
+ if return_dict is None:
432
+ return_dict = False
433
+ if repr_layers is None or len(repr_layers) == 0:
434
+ repr_layers = [-1]
435
+ if return_contacts is None:
436
+ return_contacts = False
437
+ if need_head_weights is None:
438
+ need_head_weights = True
439
+ has_pair = False
440
+ has_pair_b = False
441
+ if input_ids is not None or inputs_embeds is not None:
442
+ encoding, logits, outputs, losses = self.__forword__(
443
+ input_ids=input_ids,
444
+ attention_mask=attention_mask,
445
+ token_type_ids=token_type_ids,
446
+ position_ids=position_ids,
447
+ output_keys=output_keys,
448
+ labels=labels,
449
+ repr_layers=repr_layers,
450
+ need_head_weights=need_head_weights,
451
+ return_contacts=return_contacts,
452
+ use_last_layer_norm=use_last_layer_norm
453
+ )
454
+ has_pair = True
455
+ if input_ids_b is not None or inputs_embeds_b is not None:
456
+ encoding_b, logits_b, outputs_b, losses_b = self.__forword__(
457
+ input_ids=input_ids_b,
458
+ attention_mask=attention_mask_b,
459
+ token_type_ids=token_type_ids_b,
460
+ position_ids=position_ids_b,
461
+ output_keys=output_keys_b,
462
+ labels=labels_b,
463
+ repr_layers=repr_layers,
464
+ need_head_weights=need_head_weights,
465
+ return_contacts=return_contacts,
466
+ use_last_layer_norm=use_last_layer_norm
467
+ )
468
+ has_pair_b = True
469
+ if has_pair and has_pair_b and pair_output_keys and len(pair_output_keys) > 0:
470
+ cur_representation_vector = encoding["representation_vector"]
471
+ cur_representation_vector_b = encoding_b["representation_vector"]
472
+
473
+ pair_logits = {}
474
+ pair_outputs = {}
475
+ for cur_task_level_type, cur_task_level_names in pair_output_keys.items():
476
+ if cur_task_level_type not in pair_outputs:
477
+ pair_outputs[cur_task_level_type] = {}
478
+ pair_logits[cur_task_level_type] = {}
479
+ for cur_task_level_name in cur_task_level_names:
481
+ cur_logits = self.classifier_dropout[cur_task_level_type][cur_task_level_name](
482
+ torch.cat((cur_representation_vector, cur_representation_vector_b), dim=-1)
483
+ )
484
+ cur_hidden_layer = self.hidden_layer[cur_task_level_type][cur_task_level_name]
485
+ if cur_hidden_layer is not None:
486
+ cur_logits = cur_hidden_layer(cur_logits)
487
+ cur_logits = self.classifier[cur_task_level_type][cur_task_level_name](cur_logits)
488
+ pair_logits[cur_task_level_type][cur_task_level_name] = cur_logits
489
+ pair_outputs[cur_task_level_type][cur_task_level_name] = self.output[cur_task_level_type][cur_task_level_name](cur_logits)
490
+
491
+ pair_loss = {}
492
+ if pair_label is not None:
493
+ for cur_task_level_type, cur_task_level_names in pair_output_keys.items():
494
+ if cur_task_level_type not in pair_label:
495
+ continue
496
+ pair_loss[cur_task_level_type] = {}
497
+ for cur_task_level_name in cur_task_level_names:
500
+ if cur_task_level_name not in pair_label[cur_task_level_type]:
501
+ continue
502
+ cur_label = pair_label[cur_task_level_type][cur_task_level_name]
503
+ cur_label_size = self.label_size[cur_task_level_type][cur_task_level_name]
504
+ cur_output_mode = self.output_mode[cur_task_level_type][cur_task_level_name]
505
+ cur_loss_fct = self.loss_fct[cur_task_level_type][cur_task_level_name]
506
+ cur_logits = pair_logits[cur_task_level_type][cur_task_level_name]
507
+ cur_loss = self.__calc_loss__(
508
+ task_level_type=cur_task_level_type,
509
+ output_mode=cur_output_mode, logits=cur_logits,
510
+ label=cur_label, label_size=cur_label_size, loss_fct=cur_loss_fct,
511
+ loss_reduction="meanmean")
512
+ pair_loss[cur_task_level_type][cur_task_level_name] = cur_loss
513
+
514
+ if not return_dict:
515
+ return [[losses, losses_b, pair_loss], [outputs, outputs_b, pair_outputs], [encoding, encoding_b]]
516
+ return AllOutput(
517
+ losses=losses,
518
+ outputs=outputs,
519
+ hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
520
+ attentions=encoding["attentions"] if "attentions" in encoding else None,
521
+ global_attentions=None,
522
+ contacts=encoding["contacts"] if "contacts" in encoding else None,
523
+ losses_b=losses_b,
524
+ outputs_b=outputs_b,
525
+ hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
526
+ attentions_b=encoding_b["attentions"] if "hidden_states" in encoding_b else None,
527
+ global_attentions_b=None,
528
+ contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None,
529
+ pair_outputs=pair_outputs,
530
+ pair_losses=pair_loss)
531
+ else:
532
+ if not return_dict:
533
+ return [[losses, losses_b], [outputs, outputs_b], [encoding, encoding_b]]
534
+ return AllOutput(
535
+ losses=losses,
536
+ outputs=outputs,
537
+ hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
538
+ attentions=encoding["attentions"] if "attentions" in encoding else None,
539
+ global_attentions=None,
540
+ contacts=encoding["contacts"] if "contacts" in encoding else None,
541
+ losses_b=losses_b,
542
+ outputs_b=outputs_b,
543
+ hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
544
+ attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
545
+ global_attentions_b=None,
546
+ contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
547
+ )
548
+ elif has_pair:
549
+ if not return_dict:
550
+ return [[losses], [outputs], [encoding]]
551
+ return AllOutput(
552
+ losses=losses,
553
+ outputs=outputs,
554
+ hidden_states=encoding["representation_matrix"] if "representation_matrix" in encoding else None,
555
+ attentions=encoding["attentions"] if "attentions" in encoding else None,
556
+ global_attentions=None,
557
+ contacts=encoding["contacts"] if "contacts" in encoding else None
558
+ )
559
+ else:
560
+ if not return_dict:
561
+ return [[losses_b], [outputs_b], [encoding_b]]
562
+ return AllOutput(
563
+ losses_b=losses_b,
564
+ outputs_b=outputs_b,
565
+ hidden_states_b=encoding_b["representation_matrix"] if "representation_matrix" in encoding_b else None,
566
+ attentions_b=encoding_b["attentions"] if "attentions" in encoding_b else None,
567
+ global_attentions_b=None,
568
+ contacts_b=encoding_b["contacts"] if "contacts" in encoding_b else None
569
+ )
570
+
571
+ def predict_contacts(self, input_ids, position_ids=None, token_type_ids=None):
572
+ return self(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, return_contacts=True)["contacts"]
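A minimal usage sketch for the paired forward path above (illustrative, not part of this commit): `model` is assumed to be an already-constructed instance of this class, and the toy ids/shapes are placeholders. With both `input_ids` and `input_ids_b` supplied and no `pair_output_keys`, `forward` runs the encoder twice and returns an `AllOutput` whose `_b` fields carry the second sequence's results; without labels, no losses are computed.

import torch

batch_size, seq_len = 2, 16
input_ids = torch.randint(5, 39, (batch_size, seq_len))    # toy token ids
input_ids_b = torch.randint(5, 39, (batch_size, seq_len))

out = model(input_ids=input_ids, input_ids_b=input_ids_b, return_dict=True)
print(type(out).__name__)                                  # AllOutput
print(out.hidden_states is not None, out.hidden_states_b is not None)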
lucaone_gplm_config.py ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ from transformers import PretrainedConfig
5
+
6
+ class LucaGPLMConfig(PretrainedConfig):
7
+ model_type = "lucagplm"
8
+
9
+ def __init__(
10
+ self,
11
+ vocab_size=-1,
12
+ pad_token_id=0,
13
+ max_position_embeddings: int = 4096,
14
+ type_vocab_size: int = 2,
15
+ num_hidden_layers: int = 24,
16
+ hidden_size: int = 1280,
17
+ num_attention_heads: int = 20,
18
+ no_position_embeddings: bool = False,
19
+ no_token_type_embeddings: bool = False,
20
+ alphabet: str = "gene_prot",
21
+ token_dropout: bool = True,
22
+ attention_probs_dropout_prob=0.1,
23
+ hidden_dropout_prob=0.1,
24
+ classifier_dropout_prob=0.1,
25
+ use_embed_layer_norm=True,
26
+ use_last_layer_norm=True,
27
+ embed_scale=1.0,
28
+ ignore_index=-100,
29
+ **kwargs
30
+ ):
31
+
32
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
33
+ self.alphabet = alphabet
34
+ self.vocab_size = vocab_size
35
+ self.max_position_embeddings = max_position_embeddings
36
+ self.type_vocab_size = type_vocab_size
37
+ self.no_token_type_embeddings = no_token_type_embeddings
38
+ self.no_position_embeddings = no_position_embeddings
39
+ self.num_hidden_layers = num_hidden_layers
40
+ self.hidden_size = hidden_size
41
+ self.num_attention_heads = num_attention_heads
42
+ self.token_dropout = token_dropout
43
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
44
+ self.hidden_dropout_prob = hidden_dropout_prob
45
+ self.classifier_dropout_prob = classifier_dropout_prob
46
+ self.ignore_index = ignore_index
47
+ self.use_embed_layer_norm = use_embed_layer_norm
48
+ self.use_last_layer_norm = use_last_layer_norm
49
+ self.embed_scale = embed_scale
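Illustrative only (not part of this commit): constructing the config above with a smaller geometry for a quick local test. The sizes are arbitrary; 39 matches the gene_prot alphabet (2 prepend + 3 append + 34 standard tokens).

config = LucaGPLMConfig(
    vocab_size=39,             # e.g. the gene_prot Alphabet size
    num_hidden_layers=4,       # reduced from the 24-layer default
    hidden_size=320,           # must stay divisible by num_attention_heads
    num_attention_heads=20,
    alphabet="gene_prot",
)
print(config.max_position_embeddings)   # 4096 by default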
masked_loss.py ADDED
@@ -0,0 +1,159 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ '''
4
+ @license: (C) Copyright 2021, Hey.
5
+ @author: Hey
6
+ @email: [email protected]
7
+ @tel: 137****6540
8
+ @datetime: 2023/6/28 10:25
9
+ @project: LucaOne
10
+ @file: masked_loss.py
11
+ @desc: masked loss
12
+ '''
13
+ import warnings
14
+ import torch
15
+ import torch.nn as nn
16
+
17
+
18
+ class _MaskedLoss(nn.Module):
19
+ """Base class for masked losses"""
20
+
21
+ def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
22
+ super().__init__()
23
+ self.reduction = reduction
24
+ self.ignore_nans = ignore_nans
25
+ self.ignore_value = ignore_value
26
+
27
+ def forward(self, pred, target, mask=None):
28
+ """Compute a loss between pred and target for given mask.
29
+ Note that this implementation is faster than loss(pred[mask], target[mask])
30
+ for a given loss, and is nan-proof."""
31
+ '''
32
+ if not (target.size() == pred.size()):
33
+ warnings.warn(
34
+ "Using a target size ({}) that is different to the pred size ({}). "
35
+ "This will likely lead to incorrect results due to broadcasting. "
36
+ "Please ensure they have the same size.".format(
37
+ target.size(), pred.size()),
38
+ stacklevel=2,
39
+ )
40
+ '''
41
+ # use a local copy so forward() never mutates the configured reduction
+ reduction = self.reduction
+ if mask is None and self.ignore_value is not None:
42
+ mask = target != self.ignore_value
43
+ elif mask is None:
44
+ mask = torch.ones_like(target, dtype=bool)
45
+ target_proxy = target
46
+ if self.ignore_nans:
47
+ target_proxy = target.clone()
48
+ nans = torch.isnan(target)
49
+ if nans.any():
50
+ with torch.no_grad():
51
+ mask = mask & ~nans
52
+ target_proxy[nans] = 0
53
+ # full_loss = self.criterion(pred, target_proxy)
54
+ # print("mask shape")
55
+ # print(mask.shape)
56
+ if reduction == 'meanmean' and pred.ndim == 3 and pred.shape[-1] == 1:
57
+ # token-level binary classification
58
+ # pred: n , seq_len, 1 -> n * seq_len
59
+ # target: n, seq_len -> n * seq_len
60
+ full_loss = self.criterion(pred.view(-1), target_proxy.view(-1))
61
+ full_loss = torch.reshape(full_loss, (-1, pred.shape[1]))
62
+ # print("ok1")
63
+ elif reduction == 'meanmean' and pred.ndim == 3:
64
+ if target.ndim == 3:
65
+ # token-level regression
66
+ # pred: n , seq_len, label_size -> n * seq_len * label_size
67
+ # target: n, seq_len, label_size -> n * seq_len * label_size
68
+ full_loss = self.criterion(pred.view(-1), target_proxy.view(-1))
69
+ full_loss = torch.reshape(full_loss, (-1, pred.shape[1], pred.shape[-1]))
70
+ # print("ok21")
71
+ else:
72
+ # token-level multi classification
73
+ # pred: n , seq_len, label_size -> n * seq_len, label_size
74
+ # target: n, seq_len -> n * seq_len
75
+ full_loss = self.criterion(pred.view(-1, pred.shape[-1]), target_proxy.view(-1))
76
+ full_loss = torch.reshape(full_loss, (-1, pred.shape[1]))
77
+ # print("ok22")
78
+ elif reduction == 'meanmean' and pred.ndim == 2 and target.ndim == 2:
79
+ # seq-level multi label
80
+ # pred: n , label_size -> n * label_size
81
+ # target: n, label_size -> n * label_size
82
+ full_loss = self.criterion(pred.view(-1), target_proxy.view(-1))
83
+ full_loss = torch.reshape(full_loss, (-1, pred.shape[1]))
84
+ # print("ok3")
85
+ elif reduction == 'meanmean':
86
+ # fall back to a plain mean for this call without mutating the module
+ reduction = "mean"
87
+ full_loss = self.criterion(pred, target_proxy)
88
+ # print("ok4")
89
+ else:
90
+ full_loss = self.criterion(pred, target_proxy)
91
+ # print("ok5")
92
+
93
+ full_loss[~mask] = 0
94
+ '''
95
+ if not mask.any():
96
+ warnings.warn("Evaluation mask is False everywhere, this might lead to incorrect results.")
97
+ print(full_loss.sum(), mask.to(full_loss.dtype).sum())
98
+ '''
99
+ if reduction == 'none':
100
+ return full_loss
101
+ if reduction == 'sum':
102
+ return full_loss.sum()
103
+ if reduction == 'mean':
104
+ '''
105
+ print("mask:")
106
+ print(mask.to(full_loss.dtype).sum(dim=-1))
107
+ print(mask.to(full_loss.dtype).sum())
108
+ '''
109
+ return full_loss.sum() / (mask.to(full_loss.dtype).sum() + 1e-12)
110
+ if reduction == 'meanmean':
111
+ if mask.ndim == 3:
112
+ mask_sum = mask.to(full_loss.dtype).sum(dim=-1)
113
+ '''
114
+ print("mask:")
115
+ print(mask_sum)
116
+ '''
117
+ full_loss = full_loss.sum(dim=-1) / (mask_sum + 1e-12)
118
+ mask_sum = mask_sum.to(torch.bool).sum(dim=-1)
119
+ # print(mask_sum)
120
+ full_loss = full_loss.sum(dim=-1) / (mask_sum + 1e-12)
121
+ mask_sum = mask_sum.to(torch.bool).sum()
122
+ # print(mask_sum)
123
+ loss = full_loss.sum() / (mask_sum + 1e-12)
124
+ else:
125
+ mask_sum = mask.to(full_loss.dtype).sum(dim=-1)
126
+ '''
127
+ print("mask:")
128
+ print(mask_sum)
129
+ print(mask_sum.to(torch.bool).sum())
130
+ '''
131
+ loss = torch.sum(full_loss.sum(dim=-1) / (mask_sum + 1e-12)) / (mask_sum.to(torch.bool).sum() + 1e-12)
132
+ # print(full_loss.sum() / (mask.to(full_loss.dtype).sum() + 1e-12), loss)
133
+ return loss
134
+ if self.reduction in ["summean", "meansum"]:
135
+ if mask.ndim == 3:
136
+ mask_sum = mask.to(full_loss.dtype).sum(dim=-1)
137
+ '''
138
+ print("mask:")
139
+ print(mask_sum)
140
+ '''
141
+ full_loss = full_loss.sum(dim=-1)
142
+ mask_sum = mask_sum.to(torch.bool).sum(dim=-1)
143
+ # print(mask_sum)
144
+ full_loss = full_loss.sum(dim=-1) / (mask_sum + 1e-12)
145
+ mask_sum = mask_sum.to(torch.bool).sum()
146
+ # print(mask_sum)
147
+ loss = full_loss.sum() / (mask_sum + 1e-12)
148
+ else:
149
+ mask_sum = mask.to(full_loss.dtype).sum(dim=-1)
150
+ '''
151
+ print("mask:")
152
+ print(mask_sum)
153
+ print(mask_sum.to(torch.bool).sum())
154
+ '''
155
+ loss = full_loss.sum() / (mask_sum.to(torch.bool).sum() + 1e-12)
156
+ return loss
157
+ return full_loss
158
+
159
+
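A minimal sketch (not part of this commit) of how _MaskedLoss is intended to be specialized: a subclass supplies `self.criterion` with reduction='none', and the base forward() handles ignore-value masking, NaN handling, and the final reduction. MaskedMSELoss is a hypothetical name for illustration.

class MaskedMSELoss(_MaskedLoss):
    def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
        super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
        # element-wise criterion; the base class reduces over the valid mask
        self.criterion = nn.MSELoss(reduction='none')

pred = torch.randn(4, 8)
target = torch.randn(4, 8)
target[0, :3] = -100.0          # positions flagged with ignore_value are masked out
loss = MaskedMSELoss()(pred, target)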
metrics.py ADDED
@@ -0,0 +1,549 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ '''
4
+ @license: (C) Copyright 2021, Hey.
5
+ @author: Hey
6
+ @email: sanyuan.**@**.com
7
+ @tel: 137****6540
8
+ @datetime: 2022/11/26 21:05
9
+ @project: LucaOne
10
+ @file: metrics.py
11
+ @desc: metrics for binary classification or multi-class classification
12
+ '''
13
+ import csv
14
+ import numpy as np
15
+ import matplotlib.pyplot as plt
16
+ plt.rcParams.update({'font.size': 18})
17
+ plt.rcParams['axes.unicode_minus'] = False
18
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, \
19
+ average_precision_score, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
20
+
21
+
22
+ def topk_accuracy_score(targets, probs, k=3):
23
+ '''
24
+ top-k accuracy
25
+ :param targets: 1d-array of true class indices (n_samples, )
26
+ :param probs: 2d-array of class probabilities (n_samples, m_classes)
27
+ :param k: number of top-ranked predictions to consider
28
+ :return: top-k accuracy score
29
+ '''
30
+ # obtain top-k label
31
+ max_k_preds = probs.argsort(axis=1)[:, -k:][:, ::-1]
32
+ a_real = np.resize(targets, (targets.shape[0], 1))
33
+ # obtain the match result
34
+ match_array = np.logical_or.reduce(max_k_preds == a_real, axis=1)
35
+ topk_acc_score = match_array.sum() / match_array.shape[0]
36
+ return topk_acc_score
37
+
38
+
39
+ def multi_class_acc(targets, probs, threshold=0.5):
40
+ if targets.ndim == 2:
41
+ targets = np.argmax(targets, axis=1)
42
+ preds = np.argmax(probs, axis=1)
43
+ return accuracy_score(targets, preds)
44
+
45
+
46
+ def multi_class_precision(targets, probs, average='macro'):
47
+ if targets.ndim == 2:
48
+ targets = np.argmax(targets, axis=1)
49
+ preds = np.argmax(probs, axis=1)
50
+ return precision_score(targets, preds, average=average)
51
+
52
+
53
+ def multi_class_recall(targets, probs, average='macro'):
54
+ if targets.ndim == 2:
55
+ targets = np.argmax(targets, axis=1)
56
+ preds = np.argmax(probs, axis=1)
57
+ return recall_score(targets, preds, average=average)
58
+
59
+
60
+ def multi_class_f1(targets, probs, average='macro'):
61
+ if targets.ndim == 2:
62
+ targets = np.argmax(targets, axis=1)
63
+ preds = np.argmax(probs, axis=1)
64
+ return f1_score(targets, preds, average=average)
65
+
66
+
67
+ def multi_class_roc_auc(targets, probs, average='macro'):
68
+ if targets.ndim == 2:
69
+ targets = np.argmax(targets, axis=1)
70
+ return roc_auc_score(targets, probs, average=average, multi_class='ovr')
71
+
72
+
73
+ def multi_class_pr_auc(targets, probs, average='macro'):
74
+ if targets.ndim == 2:
75
+ targets = np.argmax(targets, axis=1)
76
+ z = probs.shape[1]
77
+ new_targets = np.eye(z)[targets]
78
+ pr_auc = average_precision_score(new_targets, probs, average=average)
79
+ return pr_auc
80
+
81
+
82
+ def metrics_multi_class(targets, probs, average="macro"):
83
+ '''
84
+ metrics of multi-class classification
85
+ :param targets: 1d-array class index (n_samples, )
86
+ :param probs: 2d-array probability (n_samples, m_classes)
87
+ :return:
88
+ '''
89
+ if targets.ndim == 2 and targets.shape[1] > 1:
90
+ targets = np.argmax(targets, axis=1)
91
+ elif targets.ndim == 2 and targets.shape[1] == 1:
92
+ targets = np.squeeze(targets, axis=1)
93
+
94
+ preds = np.argmax(probs, axis=1)
95
+ acc = accuracy_score(targets, preds)
96
+ prec = precision_score(targets, preds, average=average)
97
+ recall = recall_score(targets, preds, average=average)
98
+ f1 = f1_score(targets, preds, average=average)
99
+ result = {
100
+ "acc": round(float(acc), 6),
101
+ "prec": round(float(prec), 6),
102
+ "recall": round(float(recall), 6),
103
+ "f1": round(float(f1), 6)
104
+ }
105
+ result.update({
106
+ "top2_acc": round(float(topk_accuracy_score(targets, probs, k=2)), 6),
107
+ "top3_acc": round(float(topk_accuracy_score(targets, probs, k=3)), 6),
108
+ "top5_acc": round(float(topk_accuracy_score(targets, probs, k=5)), 6),
109
+ "top10_acc": round(float(topk_accuracy_score(targets, probs, k=10)), 6)
110
+ })
111
+ try:
112
+ roc_auc = roc_auc_score(targets, probs, average=average, multi_class='ovr')
113
+ result.update({
114
+ "roc_auc": round(float(roc_auc), 6)
115
+ })
116
+ except Exception as e:
117
+ pass
118
+ try:
119
+ z = probs.shape[1]
120
+ new_targets = np.eye(z)[targets]
121
+ pr_auc = average_precision_score(new_targets, probs, average=average)
122
+ result.update({
123
+ "pr_auc": round(float(pr_auc), 6),
124
+ })
125
+ except Exception as e:
126
+ pass
127
+ return result
128
+
129
+
130
+ def metrics_multi_class_for_pred(targets, preds, probs=None, average="macro", savepath=None):
131
+ '''
132
+ metrics for multi-class classification
133
+ :param targets: 1d-array class index (n_samples, )
134
+ :param preds: 1d-array class index (n_samples, )
135
+ :return:
136
+ '''
137
+ if targets.ndim == 2 and targets.shape[1] > 1:
138
+ targets = np.argmax(targets, axis=1)
139
+ elif targets.ndim == 2 and targets.shape[1] == 1:
140
+ targets = np.squeeze(targets, axis=1)
141
+
142
+ acc = accuracy_score(targets, preds)
143
+ prec = precision_score(targets, preds, average=average)
144
+ recall = recall_score(targets, preds, average=average)
145
+ f1 = f1_score(y_true=targets, y_pred=preds, average=average)
146
+ result = {
147
+ "acc": round(float(acc), 6),
148
+ "prec": round(float(prec), 6),
149
+ "recall": round(float(recall), 6),
150
+ "f1": round(float(f1), 6)
151
+ }
152
+ try:
153
+ roc_auc = roc_auc_score(targets, probs, average=average, multi_class='ovr')
154
+ result.update({
155
+ "roc_auc": round(float(roc_auc), 6)
156
+ })
157
+ except Exception as e:
158
+ pass
159
+ try:
160
+ z = probs.shape[1]
161
+ new_targets = np.eye(z)[targets]
162
+ pr_auc = average_precision_score(new_targets, probs, average=average)
163
+ result.update({
164
+ "pr_auc": round(float(pr_auc), 6),
165
+ })
166
+ except Exception as e:
167
+ pass
168
+ return result
169
+
170
+
171
+ def metrics_regression(targets, preds):
172
+ '''
173
+ metrics for regression
174
+ :param targets: 1d-array of true values (n_samples, )
175
+ :param preds: 1d-array of predicted values (n_samples, )
176
+ :return:
177
+ '''
178
+ mae = mean_absolute_error(targets, preds)
179
+ mse = mean_squared_error(targets, preds)
180
+ r2 = r2_score(targets, preds)
181
+ return {
182
+ "mae": round(float(mae), 6),
183
+ "mse": round(float(mse), 6),
184
+ "r2": round(float(r2), 6)
185
+ }
186
+
187
+
188
+ def transform(targets, probs, threshold):
189
+ '''
190
+ normalize targets/probs and derive hard predictions for binary classification
191
+ :param targets: 1d-array class index (n_samples, )
192
+ :param probs: 1d-array of positive-class probabilities (n_samples, )
193
+ :param threshold: 0-1 probability threshold
194
+ :return:
195
+ '''
196
+ if targets.ndim == 2:
197
+ if targets.shape[1] == 2: # [[0, 1], [1, 0]]
198
+ targets = np.argmax(targets, axis=1)
199
+ else: # [[1], [0]]
200
+ targets = targets.flatten()
201
+ if probs.ndim == 2:
202
+ if probs.shape[1] == 2: # [[0.1, 0.9], [0.9, 0.1]]
203
+ preds = np.argmax(probs, axis=1)
204
+ probs = probs[:, 1].flatten()
205
+ else: # [[0.9], [0.1]]
206
+ preds = (probs >= threshold).astype(int).flatten()
207
+ probs = probs.flatten()
208
+ else:
209
+ preds = (probs >= threshold).astype(int)
210
+ return targets, probs, preds
211
+
212
+
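# (Editor's illustrative note, not part of this commit.) transform() accepts the
# three input layouts documented above and always returns (targets, probs, preds);
# e.g. for two-column probabilities the positive-class column is kept:
#   transform(np.array([0, 1]), np.array([[0.8, 0.2], [0.3, 0.7]]), 0.5)
#   -> (array([0, 1]), array([0.2, 0.7]), array([0, 1]))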
213
+ def binary_acc(targets, probs, threshold=0.5):
214
+ targets, probs, preds = transform(targets, probs, threshold)
215
+ return accuracy_score(targets, preds)
216
+
217
+
218
+ def binary_precision(targets, probs, threshold=0.5, average='binary'):
219
+ targets, probs, preds = transform(targets, probs, threshold)
220
+ return precision_score(targets, preds, average=average)
221
+
222
+
223
+ def binary_recall(targets, probs, threshold=0.5, average='binary'):
224
+ targets, probs, preds = transform(targets, probs, threshold)
225
+ return recall_score(targets, preds, average=average)
226
+
227
+
228
+ def binary_f1(targets, probs, threshold=0.5, average='binary'):
229
+ targets, probs, preds = transform(targets, probs, threshold)
230
+ return f1_score(targets, preds, average=average)
231
+
232
+
233
+ def binary_roc_auc(targets, probs, threshold=0.5, average='macro'):
234
+ targets, probs, preds = transform(targets, probs, threshold)
235
+ return roc_auc_score(targets, probs, average=average)
236
+
237
+
238
+ def binary_pr_auc(targets, probs, threshold=0.5, average='macro'):
239
+ targets, probs, preds = transform(targets, probs, threshold)
240
+ return average_precision_score(targets, probs, average=average)
241
+
242
+
243
+ def binary_confusion_matrix(targets, probs, threshold=0.5, savepath=None):
244
+ targets, probs, preds = transform(targets, probs, threshold)
245
+ cm_obj = confusion_matrix(targets, preds, labels=[0, 1])
246
+ plot_confusion_matrix_for_binary_class(targets, preds, cm=cm_obj, savepath=savepath)
247
+ tn, fp, fn, tp = cm_obj.ravel()
248
+ cm = {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)}
249
+ return cm
250
+
251
+
252
+ def metrics_binary(targets, probs, threshold=0.5, average="binary", savepath=None):
253
+ '''
254
+ metrics for binary classification
255
+ :param targets: 1d-array class index (n_samples, )
256
+ :param probs: 1d-array of positive-class probabilities (n_samples, )
257
+ :param threshold: 0-1 prob threshold
258
+ :return:
259
+ '''
260
+ targets, probs, preds = transform(targets, probs, threshold)
274
+ acc = accuracy_score(targets, preds)
275
+ prec = precision_score(targets, preds, average=average)
276
+ recall = recall_score(targets, preds, average=average)
277
+ f1 = f1_score(targets, preds, average=average)
278
+ result = {
279
+ "acc": round(float(acc), 6),
280
+ "prec": round(float(prec), 6),
281
+ "recall": round(float(recall), 6),
282
+ "f1": round(float(f1), 6)
283
+ }
284
+ try:
285
+ roc_auc = roc_auc_score(targets, probs, average="macro")
286
+ result.update({
287
+ "roc_auc": round(float(roc_auc), 6)
288
+ })
289
+ except Exception as e:
290
+ pass
291
+ try:
292
+ pr_auc = average_precision_score(targets, probs, average="macro")
293
+ result.update({
294
+ "pr_auc": round(float(pr_auc), 6)
295
+ })
296
+ except Exception as e:
297
+ pass
298
+ try:
299
+ cm_obj = confusion_matrix(targets, preds, labels=[0, 1])
300
+ plot_confusion_matrix_for_binary_class(targets, preds, cm=cm_obj, savepath=savepath)
301
+ tn, fp, fn, tp = cm_obj.ravel()
302
+ cm = {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)}
303
+ result.update({
304
+ "confusion_matrix": cm
305
+ })
306
+ except Exception as e:
307
+ pass
308
+ # add mcc
309
+ try:
310
+ tn, fp, fn, tp = cm["tn"], cm["fp"], cm["fn"], cm["tp"]
311
+ mcc = (tn*tp - fp*fn) / (((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) ** 0.5)
312
+ result.update({
313
+ "mcc": round(mcc, 6)
314
+ })
315
+ except Exception as e:
316
+ pass
317
+ return result
318
+
319
+
320
+ def metrics_binary_for_pred(targets, preds, probs=None, average="binary", savepath=None):
321
+ '''
322
+ metrics for binary classification
323
+ :param targets: 1d-array class index (n_samples, )
324
+ :param preds: 1d-array of predicted class indices (n_samples, )
325
+ :return:
326
+ '''
327
+ if targets.ndim == 2:
328
+ if targets.shape[1] == 2: # [[1, 0], [0, 1]
329
+ targets = np.argmax(targets, axis=1)
330
+ else: # [[1], [0]]
331
+ targets = targets.flatten()
332
+ if preds.ndim == 2:
333
+ if preds.shape[1] == 2: # [[0.9, 0.1], [0.1, 0.9]]
334
+ preds = np.argmax(preds, axis=1)
335
+ else: # [[0], [1]]
336
+ preds = preds.flatten()
337
+ cm_obj = confusion_matrix(targets, preds, labels=[0, 1])
338
+ plot_confusion_matrix_for_binary_class(targets, preds, cm=cm_obj, savepath=savepath)
339
+ tn, fp, fn, tp = cm_obj.ravel()
340
+ cm = {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)}
341
+ if len(np.unique(targets)) > 1:
342
+ acc = accuracy_score(targets, preds)
343
+ prec = precision_score(targets, preds, average=average)
344
+ recall = recall_score(targets, preds, average=average)
345
+ f1 = f1_score(y_true=targets, y_pred=preds, average=average)
346
+ result = {
347
+ "acc": round(float(acc), 6),
348
+ "prec": round(float(prec), 6),
349
+ "recall": round(float(recall), 6),
350
+ "f1": round(float(f1), 6)
351
+ }
352
+ else:
354
+ result = {
355
+ "acc": round(float((cm["tp"] + cm["tn"]) / (cm["tp"] + cm["tn"] + cm["fp"] + cm["fn"])), 6),
356
+ "prec": round(float(cm["tp"]/(cm["tp"] + cm["fp"]) if cm["tp"] + cm["fp"] > 0 else 1.0), 6),
357
+ "recall": round(float(cm["tp"]/(cm["tp"] + cm["fn"]) if cm["tp"] + cm["fn"] > 0 else 1.0), 6),
358
+ }
359
+ result["f1"] = 2 * result["prec"] * result["recall"] / (result["prec"] + result["recall"])
360
+
361
+ try:
362
+ pr_auc = average_precision_score(targets, probs, average="macro")
363
+ result.update({
364
+ "pr_auc": round(float(pr_auc), 6)
365
+ })
366
+ except Exception as e:
367
+ pass
368
+ try:
369
+ roc_auc = roc_auc_score(targets, probs, average="macro")
370
+ result.update({
371
+ "roc_auc": round(float(roc_auc), 6)
372
+ })
373
+ except Exception as e:
374
+ pass
375
+ try:
376
+ tn, fp, fn, tp = cm["tn"], cm["fp"], cm["fn"], cm["tp"]
377
+ mcc = (tn*tp - fp*fn) / (((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)) ** 0.5)
378
+ result.update({
379
+ "mcc": round(mcc, 6)
380
+ })
381
+ except Exception as e:
382
+ pass
383
+ result.update({
384
+ "confusion_matrix": cm
385
+ })
386
+ return result
387
+
388
+
389
+ def write_error_samples_multi_class(filepath, samples, input_indexs, input_id_2_names, output_id_2_name, targets, probs,
390
+ use_other_diags=False, use_other_operas=False, use_checkin_department=False):
391
+ '''
392
+ write the bad cases of multi-class classification
393
+ :param filepath:
394
+ :param samples:
395
+ :param input_indexs:
396
+ :param input_id_2_names:
397
+ :param output_id_2_name:
398
+ :param targets:
399
+ :param probs:
400
+ :param use_other_diags:
401
+ :param use_other_operas:
402
+ :param use_checkin_department:
403
+ :return:
404
+ '''
405
+ targets = np.argmax(targets, axis=1)
406
+ preds = np.argmax(probs, axis=1)
407
+ with open(filepath, "w") as fp:
408
+ writer = csv.writer(fp)
409
+ writer.writerow(["score", "y_true", "y_pred", "inputs"])
410
+ for i in range(len(targets)):
411
+ target = targets[i]
412
+ pred = preds[i]
413
+ score = 1
414
+ if target != pred:
415
+ score = 0
416
+ if output_id_2_name:
417
+ target_label = output_id_2_name[target]
418
+ pred_label = output_id_2_name[pred]
419
+ else:
420
+ target_label = target
421
+ pred_label = pred
422
+ sample = samples[i]
423
+ if input_id_2_names:
424
+ new_sample = []
425
+ for idx, input_index in enumerate(input_indexs):
426
+ if input_index == 3 and not use_checkin_department:
427
+ input_index = 12
428
+ new_sample.append([input_id_2_names[idx][v] for v in sample[input_index]])
429
+ if (input_index == 6 and use_other_diags) or (input_index == 8 and use_other_operas) or (input_index == 10 and use_other_diags):
430
+ new_sample.append([input_id_2_names[idx][v] for v in sample[input_index + 1]])
431
+ else:
432
+ new_sample = sample
433
+ row = [score, target_label, pred_label, new_sample]
434
+ writer.writerow(row)
435
+
436
+
437
+ def write_error_samples_binary(filepath, samples, input_indexs, input_id_2_names, targets, probs, threshold=0.5,
438
+ use_other_diags=False, use_other_operas=False, use_checkin_department=False):
439
+ '''
440
+ write bad cases of binary classification
441
+ :param filepath:
442
+ :param samples:
443
+ :param input_indexs:
444
+ :param input_id_2_names:
445
+ :param targets:
446
+ :param probs:
447
+ :param threshold:
448
+ :param use_other_diags:
449
+ :param use_other_operas:
450
+ :param use_checkin_department:
451
+ :return:
452
+ '''
453
+ with open(filepath, "w") as fp:
454
+ writer = csv.writer(fp)
455
+ writer.writerow(["score", "y_true", "y_pred", "inputs"])
456
+ for i in range(len(targets)):
457
+ target = targets[i][0]
458
+ if target != 1:
459
+ target = 0
460
+ prob = probs[i][0]
461
+ if prob >= threshold:
462
+ pred = 1
463
+ else:
464
+ pred = 0
465
+ score = 1
466
+ if target != pred:
467
+ score = 0
468
+ target_label = "True" if target == 1 else "False"
469
+ pred_label = "True" if target == 1 else "False"
470
+ sample = samples[i]
471
+ if input_id_2_names:
472
+ new_sample = []
473
+ for idx, input_index in enumerate(input_indexs):
474
+ if input_index == 3 and not use_checkin_department:
475
+ input_index = 12
476
+ new_sample.append([input_id_2_names[idx][v] for v in sample[input_index]])
477
+ if (input_index == 6 and use_other_diags) or (input_index == 8 and use_other_operas) or (input_index == 10 and use_other_diags):
478
+ new_sample.append([input_id_2_names[idx][v] for v in sample[input_index + 1]])
479
+ else:
480
+ new_sample = sample
481
+ row = [score, target_label, pred_label, new_sample]
482
+ writer.writerow(row)
483
+
484
+
485
+ def plot_confusion_matrix_for_binary_class(targets, preds, cm=None, savepath=None):
486
+ '''
487
+ :param targets: ground truth
488
+ :param preds: predicted class indices
489
+ :param cm: confusion matrix
490
+ :param savepath: path to save the confusion matrix figure
491
+ '''
492
+
493
+ plt.figure(figsize=(40, 20), dpi=100)
494
+ if cm is None:
495
+ cm = confusion_matrix(targets, preds, labels=[0, 1])
496
+
497
+ plt.matshow(cm, cmap=plt.cm.Oranges)
498
+ plt.colorbar()
499
+
500
+ for x in range(len(cm)):
501
+ for y in range(len(cm)):
502
+ plt.annotate(cm[x, y], xy=(y, x), verticalalignment='center', horizontalalignment='center')
503
+ plt.ylabel('True')
504
+ plt.xlabel('Prediction')
505
+ if savepath:
506
+ plt.savefig(savepath, dpi=100)
507
+ else:
508
+ plt.show()
509
+ plt.close("all")
510
+
511
+
512
+ if __name__ == "__main__":
513
+ '''multi_class'''
514
+ targets = np.array([0, 1, 2, 1, 3])
515
+ probs = np.array([[0.9, 0.05, 0.05, 0], [0.5, 0.45, 0.05, 0], [0.4, 0.05, 0.55, 0], [0.1, 0.55, 0.25, 0.1], [0.4, 0.25, 0.35, 0]])
516
+ print(metrics_multi_class(targets, probs))
517
+
518
+ targets = np.array([0, 1, 2, 3, 3])
519
+ probs = np.array([[0.9, 0.05, 0.05, 0], [0.5, 0.45, 0.05, 0], [0.4, 0.05, 0.55, 0], [0.1, 0.25, 0.25, 0.4], [0.1, 0.25, 0.25, 0.4]])
520
+ print(metrics_multi_class(targets, probs))
521
+ targets = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 1]])
522
+ probs = np.array([[0.9, 0.05, 0.05, 0], [0.5, 0.45, 0.05, 0], [0.4, 0.05, 0.55, 0], [0.1, 0.25, 0.25, 0.4], [0.1, 0.25, 0.25, 0.4]])
523
+ print(metrics_multi_class(targets, probs))
524
+
525
+ '''binary'''
526
+ targets = np.array([0, 0, 1, 1])
527
+ probs = np.array([[0.1], [0.1], [0.1], [0.9]])
528
+ print(metrics_binary(targets, probs))
529
+
530
+ targets = np.array([[0], [0], [1], [1]])
531
+ probs = np.array([[0.1], [0.1], [0.1], [0.9]])
532
+ print(metrics_binary(targets, probs))
533
+
534
+ targets = np.array([0, 0, 1, 1])
535
+ probs = np.array([[0.1, 0.1, 0.1, 0.9]])
536
+ print(metrics_binary(targets, probs))
537
+
538
+ targets = np.array([0, 0, 1, 1])
539
+ probs = np.array([0.1, 0.1, 0.1, 0.9])
540
+ print(metrics_binary(targets, probs))
541
+
542
+ targets = np.array([0, 1, 2, 1, 3])
543
+ probs = np.array([[0.9, 0.05, 0.05, 0], [0.5, 0.45, 0.05, 0], [0.4, 0.05, 0.55, 0], [0.1, 0.55, 0.25, 0.1], [0.4, 0.25, 0.25, 0.1]])
544
+ z = probs.shape[1]
545
+ # print(z)
546
+ print(np.eye(z))
547
+ new_targets = np.eye(z)[targets]
548
+ print(new_targets)
549
+
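One more illustrative check (not part of this commit) for topk_accuracy_score defined above: the true class of the third sample is only the second-ranked prediction, so top-1 accuracy is 2/3 while top-2 accuracy is 1.0.

targets = np.array([0, 1, 1])
probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.5, 0.4, 0.1]])
print(topk_accuracy_score(targets, probs, k=1))   # ~0.6667
print(topk_accuracy_score(targets, probs, k=2))   # 1.0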
model_utils.py ADDED
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+
4
+ from typing import Optional, Tuple
5
+ from dataclasses import dataclass
6
+ from transformers.modeling_outputs import ModelOutput
7
+ import sys, copy, math
8
+
9
+ from .pooling import *
10
+ from .loss import *
11
+
12
+ @dataclass
13
+ class AllOutput(ModelOutput):
14
+ losses: Optional[dict[str, dict[str, torch.FloatTensor]]] = None
15
+ outputs: Optional[dict[str, dict[str, torch.FloatTensor]]] = None
16
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
17
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
18
+ cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
19
+ global_attentions: Optional[Tuple[torch.FloatTensor]] = None
20
+ contacts: Optional[Tuple[torch.FloatTensor]] = None
21
+ losses_b: Optional[dict[str, dict[str, torch.FloatTensor]]] = None
22
+ outputs_b: Optional[dict[str, dict[str, torch.FloatTensor]]] = None
23
+ hidden_states_b: Optional[Tuple[torch.FloatTensor]] = None
24
+ attentions_b: Optional[Tuple[torch.FloatTensor]] = None
25
+ cross_attentions_b: Optional[Tuple[torch.FloatTensor]] = None
26
+ global_attentions_b: Optional[Tuple[torch.FloatTensor]] = None
27
+ contacts_b: Optional[Tuple[torch.FloatTensor]] = None
28
+ pair_outputs: Optional[Tuple[torch.FloatTensor]] = None
29
+ pair_losses: Optional[dict[str, dict[str, torch.FloatTensor]]] = None
30
+
31
+
32
+ def create_pooler(task_level_type, task_level_name, config, args):
33
+ '''
34
+ pooler building
35
+ :param task_level_type: task level type (e.g. token_level, whole_level)
36
+ :param task_level_name: task name under that level
37
+ :param config: model config holding the per-task hidden_size
38
+ :param args: run args holding the per-task pooling_type
39
+ :return: a pooling module, or None for an unknown pooling_type
40
+ '''
41
+ hidden_size = config.hidden_size[task_level_type][task_level_name]
42
+ pooling_type = args.pooling_type[task_level_type][task_level_name]
43
+
44
+ if pooling_type == "max":
45
+ return GlobalMaskMaxPooling1D()
46
+ elif pooling_type == "sum":
47
+ return GlobalMaskSumPooling1D(axis=1)
48
+ elif pooling_type == "avg":
49
+ return GlobalMaskAvgPooling1D()
50
+ elif pooling_type == "attention":
51
+ return GlobalMaskContextAttentionPooling1D(embed_size=hidden_size)
52
+ elif pooling_type == "context_attention":
53
+ return GlobalMaskContextAttentionPooling1D(embed_size=hidden_size)
54
+ elif pooling_type == "weighted_attention":
55
+ return GlobalMaskWeightedAttentionPooling1D(embed_size=hidden_size)
56
+ elif pooling_type == "value_attention":
57
+ return GlobalMaskValueAttentionPooling1D(embed_size=hidden_size)
58
+ elif pooling_type == "transformer":
59
+ copy_config = copy.deepcopy(config)
60
+ copy_config.hidden_size = hidden_size
61
+ return GlobalMaskTransformerPooling1D(copy_config)
62
+ else:
63
+ return None
64
+
65
+
66
+ def create_output_loss_lucagplm(task_level_type, task_level_name, config):
67
+ '''build the per-task output/loss modules (no cls module here)'''
68
+ if not hasattr(config, "sigmoid"):
69
+ config.sigmoid = {task_level_type: {}}
70
+ elif task_level_type not in config.sigmoid:
71
+ config.sigmoid[task_level_type] = {}
72
+ config.sigmoid[task_level_type][task_level_name] = False if config.output_mode[task_level_type][task_level_name] \
73
+ in ["multi_class", "multi-class", "regression"] else True
74
+ # special case: contact prediction needs a sigmoid; whether structure tasks need one is still to be decided
75
+ if task_level_name == "prot_contact":
76
+ config.sigmoid[task_level_type][task_level_name] = True
77
+ config.num_labels = config.label_size[task_level_type][task_level_name]
78
+ if task_level_type in ["token_level", "whole_level"]:
79
+ return_types = ["output", "loss"]
80
+ else:
81
+ return_types = ["dropout", "hidden_layer", "hidden_act", "classifier", "output", "loss"]
82
+ return create_loss_function(config,
83
+ task_level_type=task_level_type,
84
+ task_level_name=task_level_name,
85
+ sigmoid=config.sigmoid[task_level_type][task_level_name],
86
+ output_mode=config.output_mode[task_level_type][task_level_name],
87
+ num_labels=config.num_labels,
88
+ loss_type=config.loss_type[task_level_type][task_level_name],
89
+ ignore_index=config.ignore_index,
90
+ pair_level=True if task_level_type == "pair_level" else False,
91
+ return_types=return_types)
92
+
93
+
94
+ def create_output_loss(task_level_type, task_level_name, cls_module, config, args):
95
+ cls = None
96
+ if task_level_type in ["token_level", "whole_level"]:
97
+ cls = cls_module(config)
98
+ dropout, hidden_layer, hidden_act, classifier, output, loss_fct = create_output_loss_lucagplm(task_level_type, task_level_name, config)
99
+ return cls, dropout, hidden_layer, hidden_act, classifier, output, loss_fct
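A minimal sketch of how create_pooler above resolves its nested lookups (illustrative, not part of this commit): config.hidden_size and args.pooling_type are nested dicts keyed by task level type and task name; the SimpleNamespace stand-ins and the "seq_level"/"gene_taxonomy" names below are made up for demonstration.

from types import SimpleNamespace

config = SimpleNamespace(hidden_size={"seq_level": {"gene_taxonomy": 1280}})
args = SimpleNamespace(pooling_type={"seq_level": {"gene_taxonomy": "value_attention"}})
pooler = create_pooler("seq_level", "gene_taxonomy", config, args)
# resolves to GlobalMaskValueAttentionPooling1D(embed_size=1280)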
modeling_bert.py ADDED
@@ -0,0 +1,1917 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ '''
4
+ @license: (C) Copyright 2021, Hey.
5
+ @author: Hey
6
+ @email: sanyuan.**@**.com
7
+ @tel: 137****6540
8
+ @datetime: 2022/12/2 09:38
9
+ @project: LucaOneTasks
10
+ @file: modeling_bert
11
+ @desc: transformer layers
12
+ '''
13
+ import math
14
+ import os
15
+ import warnings
16
+ from dataclasses import dataclass
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.utils.checkpoint
21
+ from packaging import version
22
+ from torch import nn
23
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
24
+
25
+ from transformers.activations import ACT2FN
26
+
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutputWithPastAndCrossAttentions,
29
+ BaseModelOutputWithPoolingAndCrossAttentions,
30
+ CausalLMOutputWithCrossAttentions,
31
+ MaskedLMOutput,
32
+ MultipleChoiceModelOutput,
33
+ NextSentencePredictorOutput,
34
+ QuestionAnsweringModelOutput,
35
+ SequenceClassifierOutput,
36
+ TokenClassifierOutput,
37
+ )
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
40
+ from transformers.utils import (
41
+ ModelOutput,
42
+ add_code_sample_docstrings,
43
+ add_start_docstrings,
44
+ add_start_docstrings_to_model_forward,
45
+ logging,
46
+ replace_return_docstrings,
47
+ )
48
+ from transformers.models.bert.configuration_bert import BertConfig
49
+
50
+
51
+ logger = logging.get_logger(__name__)
52
+
53
+ _CHECKPOINT_FOR_DOC = "bert-base-uncased"
54
+ _CONFIG_FOR_DOC = "BertConfig"
55
+ _TOKENIZER_FOR_DOC = "BertTokenizer"
56
+
57
+ # TokenClassification docstring
58
+ _CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english"
59
+ _TOKEN_CLASS_EXPECTED_OUTPUT = (
60
+ "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] "
61
+ )
62
+ _TOKEN_CLASS_EXPECTED_LOSS = 0.01
63
+
64
+ # QuestionAnswering docstring
65
+ _CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2"
66
+ _QA_EXPECTED_OUTPUT = "'a nice puppet'"
67
+ _QA_EXPECTED_LOSS = 7.41
68
+ _QA_TARGET_START_INDEX = 14
69
+ _QA_TARGET_END_INDEX = 15
70
+
71
+ # SequenceClassification docstring
72
+ _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity"
73
+ _SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'"
74
+ _SEQ_CLASS_EXPECTED_LOSS = 0.01
75
+
76
+
77
+ BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
78
+ "bert-base-uncased",
79
+ "bert-large-uncased",
80
+ "bert-base-cased",
81
+ "bert-large-cased",
82
+ "bert-base-multilingual-uncased",
83
+ "bert-base-multilingual-cased",
84
+ "bert-base-chinese",
85
+ "bert-base-german-cased",
86
+ "bert-large-uncased-whole-word-masking",
87
+ "bert-large-cased-whole-word-masking",
88
+ "bert-large-uncased-whole-word-masking-finetuned-squad",
89
+ "bert-large-cased-whole-word-masking-finetuned-squad",
90
+ "bert-base-cased-finetuned-mrpc",
91
+ "bert-base-german-dbmdz-cased",
92
+ "bert-base-german-dbmdz-uncased",
93
+ "cl-tohoku/bert-base-japanese",
94
+ "cl-tohoku/bert-base-japanese-whole-word-masking",
95
+ "cl-tohoku/bert-base-japanese-char",
96
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking",
97
+ "TurkuNLP/bert-base-finnish-cased-v1",
98
+ "TurkuNLP/bert-base-finnish-uncased-v1",
99
+ "wietsedv/bert-base-dutch-cased",
100
+ # See all BERT models at https://huggingface.co/models?filter=bert
101
+ ]
102
+
103
+
104
+ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
105
+ """Load tf checkpoints in a pytorch model."""
106
+ try:
107
+ import re
108
+
109
+ import numpy as np
110
+ import tensorflow as tf
111
+ except ImportError:
112
+ logger.error(
113
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
114
+ "https://www.tensorflow.org/install/ for installation instructions."
115
+ )
116
+ raise
117
+ tf_path = os.path.abspath(tf_checkpoint_path)
118
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
119
+ # Load weights from TF model
120
+ init_vars = tf.train.list_variables(tf_path)
121
+ names = []
122
+ arrays = []
123
+ for name, shape in init_vars:
124
+ logger.info(f"Loading TF weight {name} with shape {shape}")
125
+ array = tf.train.load_variable(tf_path, name)
126
+ names.append(name)
127
+ arrays.append(array)
128
+
129
+ for name, array in zip(names, arrays):
130
+ name = name.split("/")
131
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
132
+ # which are not required for using pretrained model
133
+ if any(
134
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
135
+ for n in name
136
+ ):
137
+ logger.info(f"Skipping {'/'.join(name)}")
138
+ continue
139
+ pointer = model
140
+ for m_name in name:
141
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
142
+ scope_names = re.split(r"_(\d+)", m_name)
143
+ else:
144
+ scope_names = [m_name]
145
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
146
+ pointer = getattr(pointer, "weight")
147
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
148
+ pointer = getattr(pointer, "bias")
149
+ elif scope_names[0] == "output_weights":
150
+ pointer = getattr(pointer, "weight")
151
+ elif scope_names[0] == "squad":
152
+ pointer = getattr(pointer, "classifier")
153
+ else:
154
+ try:
155
+ pointer = getattr(pointer, scope_names[0])
156
+ except AttributeError:
157
+ logger.info(f"Skipping {'/'.join(name)}")
158
+ continue
159
+ if len(scope_names) >= 2:
160
+ num = int(scope_names[1])
161
+ pointer = pointer[num]
162
+ if m_name[-11:] == "_embeddings":
163
+ pointer = getattr(pointer, "weight")
164
+ elif m_name == "kernel":
165
+ array = np.transpose(array)
166
+ try:
167
+ if pointer.shape != array.shape:
168
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
169
+ except ValueError as e:
170
+ e.args += (pointer.shape, array.shape)
171
+ raise
172
+ logger.info(f"Initialize PyTorch weight {name}")
173
+ pointer.data = torch.from_numpy(array)
174
+ return model
175
+
176
+
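# (Editor's illustrative note, not part of this commit.) A typical call to the
# TF->PyTorch converter above, mirroring upstream transformers usage; the
# checkpoint paths and the BertModel class (defined later in this file) are
# assumptions:
#   config = BertConfig.from_json_file("bert_config.json")
#   model = BertModel(config)
#   load_tf_weights_in_bert(model, config, "model.ckpt")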
177
+ class BertEmbeddings(nn.Module):
178
+ """Construct the embeddings from word, position and token_type embeddings."""
179
+
180
+ def __init__(self, config):
181
+ super().__init__()
182
+ if hasattr(config, "no_token_embeddings"):
183
+ self.no_token_embeddings = config.no_token_embeddings
184
+ else:
185
+ self.no_token_embeddings = False
186
+ if not self.no_token_embeddings:
187
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
188
+ if hasattr(config, "no_position_embeddings"):
189
+ self.no_position_embeddings = config.no_position_embeddings
190
+ else:
191
+ self.no_position_embeddings = False
192
+ if hasattr(config, "no_token_type_embeddings"):
193
+ self.no_token_type_embeddings = config.no_token_type_embeddings
194
+ else:
195
+ self.no_token_type_embeddings = False
196
+ if not self.no_position_embeddings:
197
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
198
+ if not self.no_token_type_embeddings:
199
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
200
+
201
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
202
+ # any TensorFlow checkpoint file
203
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
204
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
205
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
206
+ if not self.no_position_embeddings:
207
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
208
+ self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
209
+ if not self.no_token_type_embeddings and not self.no_position_embeddings:
210
+ if version.parse(torch.__version__) > version.parse("1.6.0"):
211
+ self.register_buffer(
212
+ "token_type_ids",
213
+ torch.zeros(self.position_ids.size(), dtype=torch.long),
214
+ persistent=False,
215
+ )
216
+
217
+ def forward(
218
+ self,
219
+ input_ids: Optional[torch.LongTensor] = None,
220
+ token_type_ids: Optional[torch.LongTensor] = None,
221
+ position_ids: Optional[torch.LongTensor] = None,
222
+ inputs_embeds: Optional[torch.FloatTensor] = None,
223
+ past_key_values_length: int = 0,
224
+ ) -> torch.Tensor:
225
+ if input_ids is not None:
226
+ input_shape = input_ids.size()
227
+ else:
228
+ input_shape = inputs_embeds.size()[:-1]
229
+
230
+ seq_length = input_shape[1]
231
+
232
+ if not self.no_position_embeddings and position_ids is None:
233
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
234
+
235
+ # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
236
+ # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
237
+ # issue #5664
238
+ if not self.no_token_type_embeddings and token_type_ids is None:
239
+ if hasattr(self, "token_type_ids"):
240
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
241
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
242
+ token_type_ids = buffered_token_type_ids_expanded
243
+ else:
244
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device if input_ids is not None else inputs_embeds.device)
245
+ if self.no_token_embeddings and inputs_embeds is None:
246
+ raise Exception("The model has not token_embeddings layer, the inputs_embeds cannot None")
247
+
248
+ if inputs_embeds is None:
249
+ inputs_embeds = self.word_embeddings(input_ids)
250
+ embeddings = inputs_embeds
251
+
252
+ if not self.no_token_type_embeddings:
253
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
254
+ embeddings += token_type_embeddings
255
+
256
+ if not self.no_position_embeddings and self.position_embedding_type == "absolute":
257
+ position_embeddings = self.position_embeddings(position_ids)
258
+ embeddings += position_embeddings
259
+
260
+ embeddings = self.LayerNorm(embeddings)
261
+ embeddings = self.dropout(embeddings)
262
+ return embeddings
263
+
264
+
265
+ class BertSelfAttention(nn.Module):
266
+ def __init__(self, config, position_embedding_type=None):
267
+ super().__init__()
268
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
269
+ raise ValueError(
270
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
271
+ f"heads ({config.num_attention_heads})"
272
+ )
273
+
274
+ self.num_attention_heads = config.num_attention_heads
275
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
276
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
277
+
278
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
279
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
280
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
281
+
282
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
283
+ self.position_embedding_type = position_embedding_type or getattr(
284
+ config, "position_embedding_type", "absolute"
285
+ )
286
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
287
+ self.max_position_embeddings = config.max_position_embeddings
288
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
289
+
290
+ self.is_decoder = config.is_decoder
291
+
292
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
293
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
294
+ x = x.view(new_x_shape)
295
+ return x.permute(0, 2, 1, 3)
296
+
297
+ def forward(
298
+ self,
299
+ hidden_states: torch.Tensor,
300
+ attention_mask: Optional[torch.FloatTensor] = None,
301
+ head_mask: Optional[torch.FloatTensor] = None,
302
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
303
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
304
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
305
+ output_attentions: Optional[bool] = False,
306
+ ) -> Tuple[torch.Tensor]:
307
+ mixed_query_layer = self.query(hidden_states)
308
+
309
+ # If this is instantiated as a cross-attention module, the keys
310
+ # and values come from an encoder; the attention mask needs to be
311
+ # such that the encoder's padding tokens are not attended to.
312
+ is_cross_attention = encoder_hidden_states is not None
313
+
314
+ if is_cross_attention and past_key_value is not None:
315
+ # reuse k,v, cross_attentions
316
+ key_layer = past_key_value[0]
317
+ value_layer = past_key_value[1]
318
+ attention_mask = encoder_attention_mask
319
+ elif is_cross_attention:
320
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
321
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
322
+ attention_mask = encoder_attention_mask
323
+ elif past_key_value is not None:
324
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
325
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
326
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
327
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
328
+ else:
329
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
330
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
331
+
332
+ query_layer = self.transpose_for_scores(mixed_query_layer)
333
+
334
+ if self.is_decoder:
335
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
336
+ # Further calls to cross_attention layer can then reuse all cross-attention
337
+ # key/value_states (first "if" case)
338
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
339
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
340
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
341
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
342
+ past_key_value = (key_layer, value_layer)
343
+
344
+ # Take the dot product between "query" and "key" to get the raw attention scores.
345
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
346
+
347
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
348
+ seq_length = hidden_states.size()[1]
349
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
350
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
351
+ distance = position_ids_l - position_ids_r
352
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
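+ # distance ranges over [-(seq_length - 1), seq_length - 1]; adding
+ # max_position_embeddings - 1 shifts it into the valid embedding-index
+ # range [0, 2 * max_position_embeddings - 2].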
353
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
354
+
355
+ if self.position_embedding_type == "relative_key":
356
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
357
+ attention_scores = attention_scores + relative_position_scores
358
+ elif self.position_embedding_type == "relative_key_query":
359
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
360
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
361
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
362
+
363
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
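+ # Scaling by sqrt(head_size) keeps the logits' variance roughly independent
+ # of the head dimension, preventing the softmax from saturating.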
364
+ if attention_mask is not None:
365
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
366
+ attention_scores = attention_scores + attention_mask
367
+
368
+ # Normalize the attention scores to probabilities.
369
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
370
+
371
+ # This is actually dropping out entire tokens to attend to, which might
372
+ # seem a bit unusual, but is taken from the original Transformer paper.
373
+ attention_probs = self.dropout(attention_probs)
374
+
375
+ # Mask heads if we want to
376
+ if head_mask is not None:
377
+ attention_probs = attention_probs * head_mask
378
+
379
+ context_layer = torch.matmul(attention_probs, value_layer)
380
+
381
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
382
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
383
+ context_layer = context_layer.view(new_context_layer_shape)
384
+
385
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
386
+
387
+ if self.is_decoder:
388
+ outputs = outputs + (past_key_value,)
389
+ return outputs
390
+
391
+
392
+ class BertSelfOutput(nn.Module):
393
+ def __init__(self, config):
394
+ super().__init__()
395
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
396
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
397
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
398
+
399
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
400
+ hidden_states = self.dense(hidden_states)
401
+ hidden_states = self.dropout(hidden_states)
402
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
403
+ return hidden_states
404
+
405
+
406
+ class BertAttention(nn.Module):
407
+ def __init__(self, config, position_embedding_type=None):
408
+ super().__init__()
409
+ self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type)
410
+ self.output = BertSelfOutput(config)
411
+ self.pruned_heads = set()
412
+
413
+ def prune_heads(self, heads):
414
+ if len(heads) == 0:
415
+ return
416
+ heads, index = find_pruneable_heads_and_indices(
417
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
418
+ )
419
+
420
+ # Prune linear layers
421
+ self.self.query = prune_linear_layer(self.self.query, index)
422
+ self.self.key = prune_linear_layer(self.self.key, index)
423
+ self.self.value = prune_linear_layer(self.self.value, index)
424
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
425
+
426
+ # Update hyper params and store pruned heads
427
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
428
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
429
+ self.pruned_heads = self.pruned_heads.union(heads)
430
+
431
+ def forward(
432
+ self,
433
+ hidden_states: torch.Tensor,
434
+ attention_mask: Optional[torch.FloatTensor] = None,
435
+ head_mask: Optional[torch.FloatTensor] = None,
436
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
437
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
438
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
439
+ output_attentions: Optional[bool] = False,
440
+ ) -> Tuple[torch.Tensor]:
441
+ self_outputs = self.self(
442
+ hidden_states,
443
+ attention_mask,
444
+ head_mask,
445
+ encoder_hidden_states,
446
+ encoder_attention_mask,
447
+ past_key_value,
448
+ output_attentions,
449
+ )
450
+ attention_output = self.output(self_outputs[0], hidden_states)
451
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
452
+ return outputs
453
+
454
+
455
+ class BertIntermediate(nn.Module):
456
+ def __init__(self, config):
457
+ super().__init__()
458
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
459
+ if isinstance(config.hidden_act, str):
460
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
461
+ else:
462
+ self.intermediate_act_fn = config.hidden_act
463
+
464
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
465
+ hidden_states = self.dense(hidden_states)
466
+ hidden_states = self.intermediate_act_fn(hidden_states)
467
+ return hidden_states
468
+
469
+
470
+ class BertOutput(nn.Module):
471
+ def __init__(self, config):
472
+ super().__init__()
473
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
474
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
475
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
476
+
477
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
478
+ hidden_states = self.dense(hidden_states)
479
+ hidden_states = self.dropout(hidden_states)
480
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
481
+ return hidden_states
482
+
483
+
484
+ class BertLayer(nn.Module):
485
+ def __init__(self, config):
486
+ super().__init__()
487
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
488
+ self.seq_len_dim = 1
489
+ self.attention = BertAttention(config)
490
+ self.is_decoder = config.is_decoder
491
+ self.add_cross_attention = config.add_cross_attention
492
+ if self.add_cross_attention:
493
+ if not self.is_decoder:
494
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
495
+ self.crossattention = BertAttention(config, position_embedding_type="absolute")
496
+ self.intermediate = BertIntermediate(config)
497
+ self.output = BertOutput(config)
498
+
499
+ def forward(
500
+ self,
501
+ hidden_states: torch.Tensor,
502
+ attention_mask: Optional[torch.FloatTensor] = None,
503
+ head_mask: Optional[torch.FloatTensor] = None,
504
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
505
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
506
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
507
+ output_attentions: Optional[bool] = False,
508
+ ) -> Tuple[torch.Tensor]:
509
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
510
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
511
+ self_attention_outputs = self.attention(
512
+ hidden_states,
513
+ attention_mask,
514
+ head_mask,
515
+ output_attentions=output_attentions,
516
+ past_key_value=self_attn_past_key_value,
517
+ )
518
+ attention_output = self_attention_outputs[0]
519
+
520
+ # if decoder, the last output is tuple of self-attn cache
521
+ if self.is_decoder:
522
+ outputs = self_attention_outputs[1:-1]
523
+ present_key_value = self_attention_outputs[-1]
524
+ else:
525
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
526
+
527
+ cross_attn_present_key_value = None
528
+ if self.is_decoder and encoder_hidden_states is not None:
529
+ if not hasattr(self, "crossattention"):
530
+ raise ValueError(
531
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
532
+ " by setting `config.add_cross_attention=True`"
533
+ )
534
+
535
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
536
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
537
+ cross_attention_outputs = self.crossattention(
538
+ attention_output,
539
+ attention_mask,
540
+ head_mask,
541
+ encoder_hidden_states,
542
+ encoder_attention_mask,
543
+ cross_attn_past_key_value,
544
+ output_attentions,
545
+ )
546
+ attention_output = cross_attention_outputs[0]
547
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
548
+
549
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
550
+ cross_attn_present_key_value = cross_attention_outputs[-1]
551
+ present_key_value = present_key_value + cross_attn_present_key_value
552
+
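+ # apply_chunking_to_forward optionally splits the feed-forward pass into
+ # chunks along the sequence dimension (seq_len_dim=1), trading extra compute
+ # for a smaller peak memory footprint when chunk_size_feed_forward > 0.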
553
+ layer_output = apply_chunking_to_forward(
554
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
555
+ )
556
+ outputs = (layer_output,) + outputs
557
+
558
+ # if decoder, return the attn key/values as the last output
559
+ if self.is_decoder:
560
+ outputs = outputs + (present_key_value,)
561
+
562
+ return outputs
563
+
564
+ def feed_forward_chunk(self, attention_output):
565
+ intermediate_output = self.intermediate(attention_output)
566
+ layer_output = self.output(intermediate_output, attention_output)
567
+ return layer_output
568
+
569
+
570
+ class BertEncoder(nn.Module):
571
+ def __init__(self, config):
572
+ super().__init__()
573
+ self.config = config
574
+ self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
575
+ self.gradient_checkpointing = False
576
+
577
+ def forward(
578
+ self,
579
+ hidden_states: torch.Tensor,
580
+ attention_mask: Optional[torch.FloatTensor] = None,
581
+ head_mask: Optional[torch.FloatTensor] = None,
582
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
583
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
584
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
585
+ use_cache: Optional[bool] = None,
586
+ output_attentions: Optional[bool] = False,
587
+ output_hidden_states: Optional[bool] = False,
588
+ return_dict: Optional[bool] = True,
589
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
590
+ all_hidden_states = () if output_hidden_states else None
591
+ all_self_attentions = () if output_attentions else None
592
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
593
+
594
+ next_decoder_cache = () if use_cache else None
595
+ for i, layer_module in enumerate(self.layer):
596
+ if output_hidden_states:
597
+ all_hidden_states = all_hidden_states + (hidden_states,)
598
+
599
+ layer_head_mask = head_mask[i] if head_mask is not None else None
600
+ past_key_value = past_key_values[i] if past_key_values is not None else None
601
+
602
+ if self.gradient_checkpointing and self.training:
603
+
604
+ if use_cache:
605
+ logger.warning(
606
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
607
+ )
608
+ use_cache = False
609
+
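+ # torch.utils.checkpoint re-invokes the forward with only the positional
+ # tensor arguments, so the non-tensor arguments (past_key_value,
+ # output_attentions) are captured in a closure instead of passed through.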
610
+ def create_custom_forward(module):
611
+ def custom_forward(*inputs):
612
+ return module(*inputs, past_key_value, output_attentions)
613
+
614
+ return custom_forward
615
+
616
+ layer_outputs = torch.utils.checkpoint.checkpoint(
617
+ create_custom_forward(layer_module),
618
+ hidden_states,
619
+ attention_mask,
620
+ layer_head_mask,
621
+ encoder_hidden_states,
622
+ encoder_attention_mask,
623
+ )
624
+ else:
625
+ layer_outputs = layer_module(
626
+ hidden_states,
627
+ attention_mask,
628
+ layer_head_mask,
629
+ encoder_hidden_states,
630
+ encoder_attention_mask,
631
+ past_key_value,
632
+ output_attentions,
633
+ )
634
+
635
+ hidden_states = layer_outputs[0]
636
+ if use_cache:
637
+ next_decoder_cache += (layer_outputs[-1],)
638
+ if output_attentions:
639
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
640
+ if self.config.add_cross_attention:
641
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
642
+
643
+ if output_hidden_states:
644
+ all_hidden_states = all_hidden_states + (hidden_states,)
645
+
646
+ if not return_dict:
647
+ return tuple(
648
+ v
649
+ for v in [
650
+ hidden_states,
651
+ next_decoder_cache,
652
+ all_hidden_states,
653
+ all_self_attentions,
654
+ all_cross_attentions,
655
+ ]
656
+ if v is not None
657
+ )
658
+ return BaseModelOutputWithPastAndCrossAttentions(
659
+ last_hidden_state=hidden_states,
660
+ past_key_values=next_decoder_cache,
661
+ hidden_states=all_hidden_states,
662
+ attentions=all_self_attentions,
663
+ cross_attentions=all_cross_attentions,
664
+ )
665
+
666
+
667
+ class BertPooler(nn.Module):
668
+ def __init__(self, config):
669
+ super().__init__()
670
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
671
+ self.activation = nn.Tanh()
672
+
673
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
674
+ # We "pool" the model by simply taking the hidden state corresponding
675
+ # to the first token.
676
+ first_token_tensor = hidden_states[:, 0]
677
+ pooled_output = self.dense(first_token_tensor)
678
+ pooled_output = self.activation(pooled_output)
679
+ return pooled_output
680
+
681
+
682
+ class BertPredictionHeadTransform(nn.Module):
683
+ def __init__(self, config):
684
+ super().__init__()
685
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
686
+ if isinstance(config.hidden_act, str):
687
+ self.transform_act_fn = ACT2FN[config.hidden_act]
688
+ else:
689
+ self.transform_act_fn = config.hidden_act
690
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
691
+
692
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
693
+ hidden_states = self.dense(hidden_states)
694
+ hidden_states = self.transform_act_fn(hidden_states)
695
+ hidden_states = self.LayerNorm(hidden_states)
696
+ return hidden_states
697
+
698
+
699
+ class BertLMPredictionHead(nn.Module):
700
+ def __init__(self, config):
701
+ super().__init__()
702
+ self.transform = BertPredictionHeadTransform(config)
703
+
704
+ # The output weights are the same as the input embeddings, but there is
705
+ # an output-only bias for each token.
706
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
707
+
708
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
709
+
710
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
711
+ self.decoder.bias = self.bias
712
+
713
+ def forward(self, hidden_states):
714
+ hidden_states = self.transform(hidden_states)
715
+ hidden_states = self.decoder(hidden_states)
716
+ return hidden_states
717
+
718
+
719
+ class BertOnlyMLMHead(nn.Module):
720
+ def __init__(self, config):
721
+ super().__init__()
722
+ self.predictions = BertLMPredictionHead(config)
723
+
724
+ def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
725
+ prediction_scores = self.predictions(sequence_output)
726
+ return prediction_scores
727
+
728
+
729
+ class BertOnlyNSPHead(nn.Module):
730
+ def __init__(self, config):
731
+ super().__init__()
732
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
733
+
734
+ def forward(self, pooled_output):
735
+ seq_relationship_score = self.seq_relationship(pooled_output)
736
+ return seq_relationship_score
737
+
738
+
739
+ class BertPreTrainingHeads(nn.Module):
740
+ def __init__(self, config):
741
+ super().__init__()
742
+ self.predictions = BertLMPredictionHead(config)
743
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
744
+
745
+ def forward(self, sequence_output, pooled_output):
746
+ prediction_scores = self.predictions(sequence_output)
747
+ seq_relationship_score = self.seq_relationship(pooled_output)
748
+ return prediction_scores, seq_relationship_score
749
+
750
+
751
+ class BertPreTrainedModel(PreTrainedModel):
752
+ """
753
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
754
+ models.
755
+ """
756
+
757
+ config_class = BertConfig
758
+ load_tf_weights = load_tf_weights_in_bert
759
+ base_model_prefix = "bert"
760
+ supports_gradient_checkpointing = True
761
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
762
+
763
+ def _init_weights(self, module):
764
+ """Initialize the weights"""
765
+ if isinstance(module, nn.Linear):
766
+ # Slightly different from the TF version which uses truncated_normal for initialization
767
+ # cf https://github.com/pytorch/pytorch/pull/5617
768
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
769
+ if module.bias is not None:
770
+ module.bias.data.zero_()
771
+ elif isinstance(module, nn.Embedding):
772
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
773
+ if module.padding_idx is not None:
774
+ module.weight.data[module.padding_idx].zero_()
775
+ elif isinstance(module, nn.LayerNorm):
776
+ module.bias.data.zero_()
777
+ module.weight.data.fill_(1.0)
778
+
779
+ def _set_gradient_checkpointing(self, module, value=False):
780
+ if isinstance(module, BertEncoder):
781
+ module.gradient_checkpointing = value
782
+
783
+
784
+ @dataclass
785
+ class BertForPreTrainingOutput(ModelOutput):
786
+ """
787
+ Output type of [`BertForPreTraining`].
788
+
789
+ Args:
790
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
791
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction
792
+ (classification) loss.
793
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
794
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
795
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
796
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
797
+ before SoftMax).
798
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
799
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
800
+ shape `(batch_size, sequence_length, hidden_size)`.
801
+
802
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
803
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
804
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
805
+ sequence_length)`.
806
+
807
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
808
+ heads.
809
+ """
810
+
811
+ loss: Optional[torch.FloatTensor] = None
812
+ prediction_logits: torch.FloatTensor = None
813
+ seq_relationship_logits: torch.FloatTensor = None
814
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
815
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
816
+
817
+
818
+ BERT_START_DOCSTRING = r"""
819
+
820
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
821
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
822
+ etc.)
823
+
824
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
825
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
826
+ and behavior.
827
+
828
+ Parameters:
829
+ config ([`BertConfig`]): Model configuration class with all the parameters of the model.
830
+ Initializing with a config file does not load the weights associated with the model, only the
831
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
832
+ """
833
+
834
+ BERT_INPUTS_DOCSTRING = r"""
835
+ Args:
836
+ input_ids (`torch.LongTensor` of shape `({0})`):
837
+ Indices of input sequence tokens in the vocabulary.
838
+
839
+ Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
840
+ [`PreTrainedTokenizer.__call__`] for details.
841
+
842
+ [What are input IDs?](../glossary#input-ids)
843
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
844
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
845
+
846
+ - 1 for tokens that are **not masked**,
847
+ - 0 for tokens that are **masked**.
848
+
849
+ [What are attention masks?](../glossary#attention-mask)
850
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
851
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
852
+ 1]`:
853
+
854
+ - 0 corresponds to a *sentence A* token,
855
+ - 1 corresponds to a *sentence B* token.
856
+
857
+ [What are token type IDs?](../glossary#token-type-ids)
858
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
859
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
860
+ config.max_position_embeddings - 1]`.
861
+
862
+ [What are position IDs?](../glossary#position-ids)
863
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
864
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
865
+
866
+ - 1 indicates the head is **not masked**,
867
+ - 0 indicates the head is **masked**.
868
+
869
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
870
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
871
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
872
+ model's internal embedding lookup matrix.
873
+ output_attentions (`bool`, *optional*):
874
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
875
+ tensors for more detail.
876
+ output_hidden_states (`bool`, *optional*):
877
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
878
+ more detail.
879
+ return_dict (`bool`, *optional*):
880
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
881
+ """
882
+
883
+
884
+ @add_start_docstrings(
885
+ "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
886
+ BERT_START_DOCSTRING,
887
+ )
888
+ class BertModel(BertPreTrainedModel):
889
+ """
890
+
891
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
892
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
893
+ all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
894
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
895
+
896
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
897
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
898
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
899
+ """
900
+
901
+ def __init__(self, config, use_pretrained_embedding=False, add_pooling_layer=True):
902
+ super().__init__(config)
903
+ self.config = config
904
+ self.use_pretrained_embedding = use_pretrained_embedding
905
+ self.add_pooling_layer = add_pooling_layer
906
+
907
+ self.embeddings = nn.Linear(config.embedding_input_size, config.hidden_size) if use_pretrained_embedding else BertEmbeddings(config)
908
+
909
+ self.encoder = BertEncoder(config)
910
+
911
+ self.pooler = BertPooler(config) if add_pooling_layer else None
912
+
913
+ # Initialize weights and apply final processing
914
+ self.post_init()
915
+
916
+ def get_input_embeddings(self):
917
+ return self.embeddings.word_embeddings
918
+
919
+ def set_input_embeddings(self, value):
920
+ self.embeddings.word_embeddings = value
921
+
922
+ def _prune_heads(self, heads_to_prune):
923
+ """
924
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
925
+ class PreTrainedModel
926
+ """
927
+ for layer, heads in heads_to_prune.items():
928
+ self.encoder.layer[layer].attention.prune_heads(heads)
929
+
930
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
931
+ @add_code_sample_docstrings(
932
+ processor_class=_TOKENIZER_FOR_DOC,
933
+ checkpoint=_CHECKPOINT_FOR_DOC,
934
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
935
+ config_class=_CONFIG_FOR_DOC,
936
+ )
937
+ def forward(
938
+ self,
939
+ input_ids: Optional[torch.Tensor] = None,
940
+ attention_mask: Optional[torch.Tensor] = None,
941
+ token_type_ids: Optional[torch.Tensor] = None,
942
+ position_ids: Optional[torch.Tensor] = None,
943
+ head_mask: Optional[torch.Tensor] = None,
944
+ inputs_embeds: Optional[torch.Tensor] = None,
945
+ encoder_hidden_states: Optional[torch.Tensor] = None,
946
+ encoder_attention_mask: Optional[torch.Tensor] = None,
947
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
948
+ use_cache: Optional[bool] = None,
949
+ output_attentions: Optional[bool] = None,
950
+ output_hidden_states: Optional[bool] = None,
951
+ return_dict: Optional[bool] = None,
952
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
953
+ r"""
954
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
955
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
956
+ the model is configured as a decoder.
957
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
958
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
959
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
960
+
961
+ - 1 for tokens that are **not masked**,
962
+ - 0 for tokens that are **masked**.
963
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
964
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
965
+
966
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
967
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
968
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
969
+ use_cache (`bool`, *optional*):
970
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
971
+ `past_key_values`).
972
+ """
973
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
974
+ output_hidden_states = (
975
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
976
+ )
977
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
978
+
979
+ if self.config.is_decoder:
980
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
981
+ else:
982
+ use_cache = False
983
+
984
+ if input_ids is not None and inputs_embeds is not None:
985
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
986
+ elif input_ids is not None:
987
+ input_shape = input_ids.size()
988
+ elif inputs_embeds is not None:
989
+ input_shape = inputs_embeds.size()[:-1]
990
+ else:
991
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
992
+
993
+ batch_size, seq_length = input_shape
994
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
995
+
996
+ # past_key_values_length
997
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
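+ # Cached keys have shape (batch, num_heads, past_seq_len, head_size), so
+ # dim 2 gives the number of positions already processed.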
998
+
999
+ if attention_mask is None:
1000
+ attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
1001
+
1002
+ if token_type_ids is None:
1003
+ if hasattr(self.embeddings, "token_type_ids"):
1004
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
1005
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
1006
+ token_type_ids = buffered_token_type_ids_expanded
1007
+ else:
1008
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
1009
+
1010
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
1011
+ # ourselves in which case we just need to make it broadcastable to all heads.
1012
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
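+ # get_extended_attention_mask turns the {0, 1} padding mask into an additive
+ # mask broadcastable to [batch_size, num_heads, seq_length, seq_length]:
+ # 0.0 for positions to keep and a large negative value for masked positions.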
1013
+
1014
+ # If a 2D or 3D attention mask is provided for the cross-attention
1015
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
1016
+ if self.config.is_decoder and encoder_hidden_states is not None:
1017
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
1018
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
1019
+ if encoder_attention_mask is None:
1020
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
1021
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
1022
+ else:
1023
+ encoder_extended_attention_mask = None
1024
+
1025
+ # Prepare head mask if needed
1026
+ # 1.0 in head_mask indicates we keep the head
1027
+ # attention_probs has shape bsz x n_heads x N x N
1028
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1029
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1030
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
1031
+
1032
+ if self.use_pretrained_embedding:
1033
+ embedding_output = self.embeddings(inputs_embeds)
1034
+ else:
1035
+ embedding_output = self.embeddings(
1036
+ input_ids=input_ids,
1037
+ position_ids=position_ids,
1038
+ token_type_ids=token_type_ids,
1039
+ inputs_embeds=inputs_embeds,
1040
+ past_key_values_length=past_key_values_length,
1041
+ )
1042
+ encoder_outputs = self.encoder(
1043
+ embedding_output,
1044
+ attention_mask=extended_attention_mask,
1045
+ head_mask=head_mask,
1046
+ encoder_hidden_states=encoder_hidden_states,
1047
+ encoder_attention_mask=encoder_extended_attention_mask,
1048
+ past_key_values=past_key_values,
1049
+ use_cache=use_cache,
1050
+ output_attentions=output_attentions,
1051
+ output_hidden_states=output_hidden_states,
1052
+ return_dict=return_dict,
1053
+ )
1054
+ sequence_output = encoder_outputs[0]
1055
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1056
+
1057
+ if not return_dict:
1058
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1059
+
1060
+ return BaseModelOutputWithPoolingAndCrossAttentions(
1061
+ last_hidden_state=sequence_output,
1062
+ pooler_output=pooled_output,
1063
+ past_key_values=encoder_outputs.past_key_values,
1064
+ hidden_states=encoder_outputs.hidden_states,
1065
+ attentions=encoder_outputs.attentions,
1066
+ cross_attentions=encoder_outputs.cross_attentions,
1067
+ )
1068
+
1069
+
1070
+ @add_start_docstrings(
1071
+ """
1072
+ Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
1073
+ sentence prediction (classification)` head.
1074
+ """,
1075
+ BERT_START_DOCSTRING,
1076
+ )
1077
+ class BertForPreTraining(BertPreTrainedModel):
1078
+ def __init__(self, config):
1079
+ super().__init__(config)
1080
+
1081
+ self.bert = BertModel(config)
1082
+ self.cls = BertPreTrainingHeads(config)
1083
+
1084
+ # Initialize weights and apply final processing
1085
+ self.post_init()
1086
+
1087
+ def get_output_embeddings(self):
1088
+ return self.cls.predictions.decoder
1089
+
1090
+ def set_output_embeddings(self, new_embeddings):
1091
+ self.cls.predictions.decoder = new_embeddings
1092
+
1093
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1094
+ @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
1095
+ def forward(
1096
+ self,
1097
+ input_ids: Optional[torch.Tensor] = None,
1098
+ attention_mask: Optional[torch.Tensor] = None,
1099
+ token_type_ids: Optional[torch.Tensor] = None,
1100
+ position_ids: Optional[torch.Tensor] = None,
1101
+ head_mask: Optional[torch.Tensor] = None,
1102
+ inputs_embeds: Optional[torch.Tensor] = None,
1103
+ labels: Optional[torch.Tensor] = None,
1104
+ next_sentence_label: Optional[torch.Tensor] = None,
1105
+ output_attentions: Optional[bool] = None,
1106
+ output_hidden_states: Optional[bool] = None,
1107
+ return_dict: Optional[bool] = None,
1108
+ ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]:
1109
+ r"""
1110
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1111
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1112
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked),
1113
+ the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1114
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1115
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
1116
+ pair (see `input_ids` docstring). Indices should be in `[0, 1]`:
1117
+
1118
+ - 0 indicates sequence B is a continuation of sequence A,
1119
+ - 1 indicates sequence B is a random sequence.
1120
+ kwargs (`Dict[str, any]`, optional, defaults to *{}*):
1121
+ Used to hide legacy arguments that have been deprecated.
1122
+
1123
+ Returns:
1124
+
1125
+ Example:
1126
+
1127
+ ```python
1128
+ >>> from transformers import BertTokenizer, BertForPreTraining
1129
+ >>> import torch
1130
+
1131
+ >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
1132
+ >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
1133
+
1134
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1135
+ >>> outputs = model(**inputs)
1136
+
1137
+ >>> prediction_logits = outputs.prediction_logits
1138
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
1139
+ ```
1140
+ """
1141
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1142
+
1143
+ outputs = self.bert(
1144
+ input_ids,
1145
+ attention_mask=attention_mask,
1146
+ token_type_ids=token_type_ids,
1147
+ position_ids=position_ids,
1148
+ head_mask=head_mask,
1149
+ inputs_embeds=inputs_embeds,
1150
+ output_attentions=output_attentions,
1151
+ output_hidden_states=output_hidden_states,
1152
+ return_dict=return_dict,
1153
+ )
1154
+
1155
+ sequence_output, pooled_output = outputs[:2]
1156
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
1157
+
1158
+ total_loss = None
1159
+ if labels is not None and next_sentence_label is not None:
1160
+ loss_fct = CrossEntropyLoss()
1161
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1162
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
1163
+ total_loss = masked_lm_loss + next_sentence_loss
1164
+
1165
+ if not return_dict:
1166
+ output = (prediction_scores, seq_relationship_score) + outputs[2:]
1167
+ return ((total_loss,) + output) if total_loss is not None else output
1168
+
1169
+ return BertForPreTrainingOutput(
1170
+ loss=total_loss,
1171
+ prediction_logits=prediction_scores,
1172
+ seq_relationship_logits=seq_relationship_score,
1173
+ hidden_states=outputs.hidden_states,
1174
+ attentions=outputs.attentions,
1175
+ )
1176
+
1177
+
1178
+ @add_start_docstrings(
1179
+ """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING
1180
+ )
1181
+ class BertLMHeadModel(BertPreTrainedModel):
1182
+
1183
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1184
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
1185
+
1186
+ def __init__(self, config):
1187
+ super().__init__(config)
1188
+
1189
+ if not config.is_decoder:
1190
+ logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")
1191
+
1192
+ self.bert = BertModel(config, add_pooling_layer=False)
1193
+ self.cls = BertOnlyMLMHead(config)
1194
+
1195
+ # Initialize weights and apply final processing
1196
+ self.post_init()
1197
+
1198
+ def get_output_embeddings(self):
1199
+ return self.cls.predictions.decoder
1200
+
1201
+ def set_output_embeddings(self, new_embeddings):
1202
+ self.cls.predictions.decoder = new_embeddings
1203
+
1204
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1205
+ @add_code_sample_docstrings(
1206
+ processor_class=_TOKENIZER_FOR_DOC,
1207
+ checkpoint=_CHECKPOINT_FOR_DOC,
1208
+ output_type=CausalLMOutputWithCrossAttentions,
1209
+ config_class=_CONFIG_FOR_DOC,
1210
+ )
1211
+ def forward(
1212
+ self,
1213
+ input_ids: Optional[torch.Tensor] = None,
1214
+ attention_mask: Optional[torch.Tensor] = None,
1215
+ token_type_ids: Optional[torch.Tensor] = None,
1216
+ position_ids: Optional[torch.Tensor] = None,
1217
+ head_mask: Optional[torch.Tensor] = None,
1218
+ inputs_embeds: Optional[torch.Tensor] = None,
1219
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1220
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1221
+ labels: Optional[torch.Tensor] = None,
1222
+ past_key_values: Optional[List[torch.Tensor]] = None,
1223
+ use_cache: Optional[bool] = None,
1224
+ output_attentions: Optional[bool] = None,
1225
+ output_hidden_states: Optional[bool] = None,
1226
+ return_dict: Optional[bool] = None,
1227
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
1228
+ r"""
1229
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1230
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1231
+ the model is configured as a decoder.
1232
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1233
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1234
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
1235
+
1236
+ - 1 for tokens that are **not masked**,
1237
+ - 0 for tokens that are **masked**.
1238
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1239
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
1240
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
1241
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1242
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1243
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1244
+
1245
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1246
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1247
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1248
+ use_cache (`bool`, *optional*):
1249
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1250
+ `past_key_values`).
1251
+ """
1252
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1253
+ if labels is not None:
1254
+ use_cache = False
1255
+
1256
+ outputs = self.bert(
1257
+ input_ids,
1258
+ attention_mask=attention_mask,
1259
+ token_type_ids=token_type_ids,
1260
+ position_ids=position_ids,
1261
+ head_mask=head_mask,
1262
+ inputs_embeds=inputs_embeds,
1263
+ encoder_hidden_states=encoder_hidden_states,
1264
+ encoder_attention_mask=encoder_attention_mask,
1265
+ past_key_values=past_key_values,
1266
+ use_cache=use_cache,
1267
+ output_attentions=output_attentions,
1268
+ output_hidden_states=output_hidden_states,
1269
+ return_dict=return_dict,
1270
+ )
1271
+
1272
+ sequence_output = outputs[0]
1273
+ prediction_scores = self.cls(sequence_output)
1274
+
1275
+ lm_loss = None
1276
+ if labels is not None:
1277
+ # we are doing next-token prediction; shift prediction scores and input ids by one
1278
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
1279
+ labels = labels[:, 1:].contiguous()
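+ # e.g. for tokens [t0, t1, t2]: scores at positions [t0, t1] are compared
+ # against labels [t1, t2], i.e. each position predicts the next token.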
1280
+ loss_fct = CrossEntropyLoss()
1281
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1282
+
1283
+ if not return_dict:
1284
+ output = (prediction_scores,) + outputs[2:]
1285
+ return ((lm_loss,) + output) if lm_loss is not None else output
1286
+
1287
+ return CausalLMOutputWithCrossAttentions(
1288
+ loss=lm_loss,
1289
+ logits=prediction_scores,
1290
+ past_key_values=outputs.past_key_values,
1291
+ hidden_states=outputs.hidden_states,
1292
+ attentions=outputs.attentions,
1293
+ cross_attentions=outputs.cross_attentions,
1294
+ )
1295
+
1296
+ def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
1297
+ input_shape = input_ids.shape
1298
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
1299
+ if attention_mask is None:
1300
+ attention_mask = input_ids.new_ones(input_shape)
1301
+
1302
+ # cut decoder_input_ids if past is used
1303
+ if past is not None:
1304
+ input_ids = input_ids[:, -1:]
1305
+
1306
+ return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past}
1307
+
1308
+ def _reorder_cache(self, past, beam_idx):
1309
+ reordered_past = ()
1310
+ for layer_past in past:
1311
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1312
+ return reordered_past
1313
+
1314
+
1315
+ @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING)
1316
+ class BertForMaskedLM(BertPreTrainedModel):
1317
+
1318
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1319
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
1320
+
1321
+ def __init__(self, config):
1322
+ super().__init__(config)
1323
+
1324
+ if config.is_decoder:
1325
+ logger.warning(
1326
+ "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
1327
+ "bi-directional self-attention."
1328
+ )
1329
+
1330
+ self.bert = BertModel(config, add_pooling_layer=False)
1331
+ self.cls = BertOnlyMLMHead(config)
1332
+
1333
+ # Initialize weights and apply final processing
1334
+ self.post_init()
1335
+
1336
+ def get_output_embeddings(self):
1337
+ return self.cls.predictions.decoder
1338
+
1339
+ def set_output_embeddings(self, new_embeddings):
1340
+ self.cls.predictions.decoder = new_embeddings
1341
+
1342
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1343
+ @add_code_sample_docstrings(
1344
+ processor_class=_TOKENIZER_FOR_DOC,
1345
+ checkpoint=_CHECKPOINT_FOR_DOC,
1346
+ output_type=MaskedLMOutput,
1347
+ config_class=_CONFIG_FOR_DOC,
1348
+ expected_output="'paris'",
1349
+ expected_loss=0.88,
1350
+ )
1351
+ def forward(
1352
+ self,
1353
+ input_ids: Optional[torch.Tensor] = None,
1354
+ attention_mask: Optional[torch.Tensor] = None,
1355
+ token_type_ids: Optional[torch.Tensor] = None,
1356
+ position_ids: Optional[torch.Tensor] = None,
1357
+ head_mask: Optional[torch.Tensor] = None,
1358
+ inputs_embeds: Optional[torch.Tensor] = None,
1359
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1360
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1361
+ labels: Optional[torch.Tensor] = None,
1362
+ output_attentions: Optional[bool] = None,
1363
+ output_hidden_states: Optional[bool] = None,
1364
+ return_dict: Optional[bool] = None,
1365
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
1366
+ r"""
1367
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1368
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1369
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the
1370
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1371
+ """
1372
+
1373
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1374
+
1375
+ outputs = self.bert(
1376
+ input_ids,
1377
+ attention_mask=attention_mask,
1378
+ token_type_ids=token_type_ids,
1379
+ position_ids=position_ids,
1380
+ head_mask=head_mask,
1381
+ inputs_embeds=inputs_embeds,
1382
+ encoder_hidden_states=encoder_hidden_states,
1383
+ encoder_attention_mask=encoder_attention_mask,
1384
+ output_attentions=output_attentions,
1385
+ output_hidden_states=output_hidden_states,
1386
+ return_dict=return_dict,
1387
+ )
1388
+
1389
+ sequence_output = outputs[0]
1390
+ prediction_scores = self.cls(sequence_output)
1391
+
1392
+ masked_lm_loss = None
1393
+ if labels is not None:
1394
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
1395
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1396
+
1397
+ if not return_dict:
1398
+ output = (prediction_scores,) + outputs[2:]
1399
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1400
+
1401
+ return MaskedLMOutput(
1402
+ loss=masked_lm_loss,
1403
+ logits=prediction_scores,
1404
+ hidden_states=outputs.hidden_states,
1405
+ attentions=outputs.attentions,
1406
+ )
1407
+
1408
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
1409
+ input_shape = input_ids.shape
1410
+ effective_batch_size = input_shape[0]
1411
+
1412
+ # add a dummy token
1413
+ if self.config.pad_token_id is None:
1414
+ raise ValueError("The PAD token should be defined for generation")
1415
+
1416
+ attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
1417
+ dummy_token = torch.full(
1418
+ (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
1419
+ )
1420
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
1421
+
1422
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
1423
+
1424
+
1425
+ @add_start_docstrings(
1426
+ """Bert Model with a `next sentence prediction (classification)` head on top.""",
1427
+ BERT_START_DOCSTRING,
1428
+ )
1429
+ class BertForNextSentencePrediction(BertPreTrainedModel):
1430
+ def __init__(self, config):
1431
+ super().__init__(config)
1432
+
1433
+ self.bert = BertModel(config)
1434
+ self.cls = BertOnlyNSPHead(config)
1435
+
1436
+ # Initialize weights and apply final processing
1437
+ self.post_init()
1438
+
1439
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1440
+ @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
1441
+ def forward(
1442
+ self,
1443
+ input_ids: Optional[torch.Tensor] = None,
1444
+ attention_mask: Optional[torch.Tensor] = None,
1445
+ token_type_ids: Optional[torch.Tensor] = None,
1446
+ position_ids: Optional[torch.Tensor] = None,
1447
+ head_mask: Optional[torch.Tensor] = None,
1448
+ inputs_embeds: Optional[torch.Tensor] = None,
1449
+ labels: Optional[torch.Tensor] = None,
1450
+ output_attentions: Optional[bool] = None,
1451
+ output_hidden_states: Optional[bool] = None,
1452
+ return_dict: Optional[bool] = None,
1453
+ **kwargs,
1454
+ ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
1455
+ r"""
1456
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1457
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
1458
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
1459
+
1460
+ - 0 indicates sequence B is a continuation of sequence A,
1461
+ - 1 indicates sequence B is a random sequence.
1462
+
1463
+ Returns:
1464
+
1465
+ Example:
1466
+
1467
+ ```python
1468
+ >>> from transformers import BertTokenizer, BertForNextSentencePrediction
1469
+ >>> import torch
1470
+
1471
+ >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
1472
+ >>> model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
1473
+
1474
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
1475
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
1476
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
1477
+
1478
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
1479
+ >>> logits = outputs.logits
1480
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
1481
+ ```
1482
+ """
1483
+
1484
+ if "next_sentence_label" in kwargs:
1485
+ warnings.warn(
1486
+ "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
1487
+ " `labels` instead.",
1488
+ FutureWarning,
1489
+ )
1490
+ labels = kwargs.pop("next_sentence_label")
1491
+
1492
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1493
+
1494
+ outputs = self.bert(
1495
+ input_ids,
1496
+ attention_mask=attention_mask,
1497
+ token_type_ids=token_type_ids,
1498
+ position_ids=position_ids,
1499
+ head_mask=head_mask,
1500
+ inputs_embeds=inputs_embeds,
1501
+ output_attentions=output_attentions,
1502
+ output_hidden_states=output_hidden_states,
1503
+ return_dict=return_dict,
1504
+ )
1505
+
1506
+ pooled_output = outputs[1]
1507
+
1508
+ seq_relationship_scores = self.cls(pooled_output)
1509
+
1510
+ next_sentence_loss = None
1511
+ if labels is not None:
1512
+ loss_fct = CrossEntropyLoss()
1513
+ next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
1514
+
1515
+ if not return_dict:
1516
+ output = (seq_relationship_scores,) + outputs[2:]
1517
+ return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
1518
+
1519
+ return NextSentencePredictorOutput(
1520
+ loss=next_sentence_loss,
1521
+ logits=seq_relationship_scores,
1522
+ hidden_states=outputs.hidden_states,
1523
+ attentions=outputs.attentions,
1524
+ )
1525
+
1526
+
1527
+ @add_start_docstrings(
+     """
+     Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+     output) e.g. for GLUE tasks.
+     """,
+     BERT_START_DOCSTRING,
+ )
+ class BertForSequenceClassification(BertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.config = config
+
+         self.bert = BertModel(config)
+         classifier_dropout_prob = (
+             config.classifier_dropout_prob if config.classifier_dropout_prob is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout_prob)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         processor_class=_TOKENIZER_FOR_DOC,
+         checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
+         output_type=SequenceClassifierOutput,
+         config_class=_CONFIG_FOR_DOC,
+         expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
+         expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         pooled_output = outputs[1]
+
+         pooled_output = self.dropout(pooled_output)
+         logits = self.classifier(pooled_output)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 if self.num_labels == 1:
+                     loss = loss_fct(logits.squeeze(), labels.squeeze())
+                 else:
+                     loss = loss_fct(logits, labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
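A minimal standalone sketch (not part of this commit; the tensors and helper name are made up) of how the `problem_type` dispatch above selects a loss: regression for a single label, cross-entropy for integer labels, BCE-with-logits for multi-hot float labels.

import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

def infer_problem_type(num_labels, labels):
    # mirrors the dispatch in BertForSequenceClassification.forward above
    if num_labels == 1:
        return "regression"
    if labels.dtype in (torch.long, torch.int):
        return "single_label_classification"
    return "multi_label_classification"

logits = torch.randn(4, 3)
hard_labels = torch.tensor([0, 2, 1, 0])                   # -> cross-entropy
multi_hot = (torch.rand(4, 3) > 0.5).float()               # -> BCE-with-logits
print(infer_problem_type(3, hard_labels))                  # single_label_classification
print(CrossEntropyLoss()(logits, hard_labels).item())
print(BCEWithLogitsLoss()(logits, multi_hot).item())
print(MSELoss()(torch.randn(4), torch.randn(4)).item())    # the num_labels == 1 case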
+ @add_start_docstrings(
+     """
+     Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+     softmax) e.g. for RocStories/SWAG tasks.
+     """,
+     BERT_START_DOCSTRING,
+ )
+ class BertForMultipleChoice(BertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.bert = BertModel(config)
+         classifier_dropout_prob = (
+             config.classifier_dropout_prob if config.classifier_dropout_prob is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout_prob)
+         self.classifier = nn.Linear(config.hidden_size, 1)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+     @add_code_sample_docstrings(
+         processor_class=_TOKENIZER_FOR_DOC,
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=MultipleChoiceModelOutput,
+         config_class=_CONFIG_FOR_DOC,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+             num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+             `input_ids` above)
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+         input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+         attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+         token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+         position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+         inputs_embeds = (
+             inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+             if inputs_embeds is not None
+             else None
+         )
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         pooled_output = outputs[1]
+
+         pooled_output = self.dropout(pooled_output)
+         logits = self.classifier(pooled_output)
+         reshaped_logits = logits.view(-1, num_choices)
+
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(reshaped_logits, labels)
+
+         if not return_dict:
+             output = (reshaped_logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return MultipleChoiceModelOutput(
+             loss=loss,
+             logits=reshaped_logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
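A hedged sketch (mine, not from this repo) of the multiple-choice reshaping used above: `(batch, num_choices, seq_len)` is flattened for the encoder, and the per-choice scores are folded back before softmax.

import torch

batch, num_choices, seq_len, hidden = 2, 4, 7, 16
input_ids = torch.randint(0, 100, (batch, num_choices, seq_len))
flat = input_ids.view(-1, input_ids.size(-1))          # (batch * num_choices, seq_len)
pooled = torch.randn(batch * num_choices, hidden)      # stand-in for the BERT pooled output
logits = torch.nn.Linear(hidden, 1)(pooled)            # one score per choice
reshaped = logits.view(-1, num_choices)                # (batch, num_choices)
print(flat.shape, reshaped.shape)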
+ @add_start_docstrings(
+     """
+     Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+     Named-Entity-Recognition (NER) tasks.
+     """,
+     BERT_START_DOCSTRING,
+ )
+ class BertForTokenClassification(BertPreTrainedModel):
+
+     _keys_to_ignore_on_load_unexpected = [r"pooler"]
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         self.bert = BertModel(config, add_pooling_layer=False)
+         classifier_dropout_prob = (
+             config.classifier_dropout_prob if config.classifier_dropout_prob is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout_prob)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         processor_class=_TOKENIZER_FOR_DOC,
+         checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION,
+         output_type=TokenClassifierOutput,
+         config_class=_CONFIG_FOR_DOC,
+         expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
+         expected_loss=_TOKEN_CLASS_EXPECTED_LOSS,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+
+         sequence_output = self.dropout(sequence_output)
+         logits = self.classifier(sequence_output)
+
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
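An illustrative sketch (assumption: `-100` marks positions to skip, which is `CrossEntropyLoss`'s default `ignore_index`) of the flattened token-level loss computed above.

import torch
from torch.nn import CrossEntropyLoss

num_labels, seq_len = 5, 6
logits = torch.randn(2, seq_len, num_labels)
labels = torch.tensor([[0, 1, 2, -100, -100, -100],
                       [3, 4, 0, 1, -100, -100]])
loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
print(loss)  # the -100 (e.g. padded) positions contribute nothing to the loss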
+ @add_start_docstrings(
+     """
+     Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
+     layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+     """,
+     BERT_START_DOCSTRING,
+ )
+ class BertForQuestionAnswering(BertPreTrainedModel):
+
+     _keys_to_ignore_on_load_unexpected = [r"pooler"]
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         self.bert = BertModel(config, add_pooling_layer=False)
+         self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+     @add_code_sample_docstrings(
+         processor_class=_TOKENIZER_FOR_DOC,
+         checkpoint=_CHECKPOINT_FOR_QA,
+         output_type=QuestionAnsweringModelOutput,
+         config_class=_CONFIG_FOR_DOC,
+         qa_target_start_index=_QA_TARGET_START_INDEX,
+         qa_target_end_index=_QA_TARGET_END_INDEX,
+         expected_output=_QA_EXPECTED_OUTPUT,
+         expected_loss=_QA_EXPECTED_LOSS,
+     )
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         start_positions: Optional[torch.Tensor] = None,
+         end_positions: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+         r"""
+         start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for position (index) of the start of the labelled span for computing the token classification loss.
+             Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+             are not taken into account for computing the loss.
+         end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for position (index) of the end of the labelled span for computing the token classification loss.
+             Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+             are not taken into account for computing the loss.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+
+         logits = self.qa_outputs(sequence_output)
+         start_logits, end_logits = logits.split(1, dim=-1)
+         start_logits = start_logits.squeeze(-1).contiguous()
+         end_logits = end_logits.squeeze(-1).contiguous()
+
+         total_loss = None
+         if start_positions is not None and end_positions is not None:
+             # If we are on multi-GPU, squeeze the extra dimension added by gathering
+             if len(start_positions.size()) > 1:
+                 start_positions = start_positions.squeeze(-1)
+             if len(end_positions.size()) > 1:
+                 end_positions = end_positions.squeeze(-1)
+             # sometimes the start/end positions are outside our model inputs; we ignore these terms
+             ignored_index = start_logits.size(1)
+             start_positions = start_positions.clamp(0, ignored_index)
+             end_positions = end_positions.clamp(0, ignored_index)
+
+             loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+             start_loss = loss_fct(start_logits, start_positions)
+             end_loss = loss_fct(end_logits, end_positions)
+             total_loss = (start_loss + end_loss) / 2
+
+         if not return_dict:
+             output = (start_logits, end_logits) + outputs[2:]
+             return ((total_loss,) + output) if total_loss is not None else output
+
+         return QuestionAnsweringModelOutput(
+             loss=total_loss,
+             start_logits=start_logits,
+             end_logits=end_logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
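A small sketch (illustrative, not the author's code) of the span head above: one linear layer yields two logits per token, which are split into start/end scores, and out-of-range gold positions are clamped to an index the loss then ignores.

import torch

seq_len, hidden = 10, 16
sequence_output = torch.randn(1, seq_len, hidden)
qa_outputs = torch.nn.Linear(hidden, 2)
start_logits, end_logits = qa_outputs(sequence_output).split(1, dim=-1)
start_logits = start_logits.squeeze(-1)  # (1, seq_len)
end_logits = end_logits.squeeze(-1)
ignored_index = start_logits.size(1)
start_positions = torch.tensor([42]).clamp(0, ignored_index)
print(start_logits.shape, start_positions)  # clamped to 10, which the loss ignores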
modeling_gplm.py ADDED
@@ -0,0 +1,1225 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ import math
+ from typing import Dict, Optional, Sequence, Tuple, List, Union
+ import uuid
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor, nn
+ from torch.nn import Parameter
+
+
+ def gelu(x):
+     """Exact (erf-based) implementation of the GELU activation function.
+     For reference, OpenAI GPT's tanh approximation is:
+     0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+     """
+     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
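A quick numerical check (illustrative only): the erf form used above and the tanh approximation quoted in the docstring agree closely over a typical activation range.

import math
import torch

x = torch.linspace(-4, 4, steps=9)
exact = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
approx = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
print((exact - approx).abs().max())  # small, well under 1e-2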
+ def symmetrize(x):
+     "Make layer symmetric in final two dimensions, used for contact prediction."
+     return x + x.transpose(-1, -2)
+
+
+ def apc(x):
+     "Perform average product correction (APC), used for contact prediction."
+     a1 = x.sum(-1, keepdims=True)
+     a2 = x.sum(-2, keepdims=True)
+     a12 = x.sum((-1, -2), keepdims=True)
+
+     avg = a1 * a2
+     avg.div_(a12)  # in-place to reduce memory
+     normalized = x - avg
+     return normalized
+
+
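A toy example (not in the source): APC subtracts the row/column "background" signal from a symmetric attention map before contact regression; the corrected map sums to zero by construction.

import torch

attn = torch.rand(1, 8, 8)
sym = attn + attn.transpose(-1, -2)      # symmetrize
a1 = sym.sum(-1, keepdims=True)
a2 = sym.sum(-2, keepdims=True)
a12 = sym.sum((-1, -2), keepdims=True)
corrected = sym - a1 * a2 / a12          # same math as apc(), but out of place
print(corrected.shape, corrected.sum())  # sum is ~0 by construction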
+ class LucaGPLM1LayerNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-12, affine=True):
+         """Construct a layernorm layer in the TF style (eps inside the sqrt)."""
+         super().__init__()
+         self.hidden_size = (hidden_size,) if isinstance(hidden_size, int) else tuple(hidden_size)
+         self.eps = eps
+         self.affine = bool(affine)
+         if self.affine:
+             self.weight = nn.Parameter(torch.ones(hidden_size))
+             self.bias = nn.Parameter(torch.zeros(hidden_size))
+         else:
+             self.weight, self.bias = None, None
+
+     def forward(self, x):
+         dims = tuple(-(i + 1) for i in range(len(self.hidden_size)))
+         means = x.mean(dims, keepdim=True)
+         x_zeromean = x - means
+         variances = x_zeromean.pow(2).mean(dims, keepdim=True)
+         x = x_zeromean / torch.sqrt(variances + self.eps)
+         if self.affine:
+             x = (self.weight * x) + self.bias
+         return x
+
+
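A sanity-check sketch (assumes the class defined just above is in scope): with a matching eps, this hand-rolled LayerNorm agrees with `torch.nn.LayerNorm` on the last dimension, since both use the biased variance.

import torch

ln = LucaGPLM1LayerNorm(32, eps=1e-5)
ref = torch.nn.LayerNorm(32, eps=1e-5)
x = torch.randn(2, 7, 32)
print(torch.allclose(ln(x), ref(x), atol=1e-5))  # True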
+ try:
+     # Optimized LayerNorm
+     from apex.normalization import FusedLayerNorm as _FusedLayerNorm
+
+     class LucaGPLM1bLayerNorm(_FusedLayerNorm):
+         @torch.jit.unused
+         def forward(self, x):
+             if not x.is_cuda:
+                 return super().forward(x)
+             else:
+                 with torch.cuda.device(x.device):
+                     return super().forward(x)
+
+ except ImportError as e:
+     print("import apex err:", e)
+     from torch.nn import LayerNorm as LucaGPLM1bLayerNorm
+
+
+ class LucaGPLMTransformerLayer(nn.Module):
+     """LucaGPLM Transformer layer block."""
+
+     def __init__(
+         self,
+         embed_dim,
+         ffn_embed_dim,
+         attention_heads,
+         add_bias_kv=True,
+         use_lucagplm1b_layer_norm=False,
+         use_rotary_embeddings: bool = False,
+     ):
+         '''
+         Transformer-Encoder layer
+         :param embed_dim: token embedding dim
+         :param ffn_embed_dim: fully connected layer dim
+         :param attention_heads: heads num
+         :param add_bias_kv: whether the key-value projections add a bias
+         :param use_lucagplm1b_layer_norm: whether to use the lucagplm-1b layer norm
+         :param use_rotary_embeddings: whether to use rotary embeddings
+         '''
+         super().__init__()
+         self.embed_dim = embed_dim
+         self.ffn_embed_dim = ffn_embed_dim
+         self.attention_heads = attention_heads
+         self.use_rotary_embeddings = use_rotary_embeddings
+         self._init_submodules(add_bias_kv, use_lucagplm1b_layer_norm)
+
+     def _init_submodules(self, add_bias_kv, use_lucagplm1b_layer_norm):
+         LucaGPLMLayerNorm = LucaGPLM1bLayerNorm if use_lucagplm1b_layer_norm else LucaGPLM1LayerNorm
+
+         # pre layer norm
+         self.pre_layer_norm = LucaGPLMLayerNorm(self.embed_dim)
+
+         self.self_attn = LucaGPLMMultiheadAttention(
+             self.embed_dim,
+             self.attention_heads,
+             add_bias_kv=add_bias_kv,
+             add_zero_attn=False,
+             use_rotary_embeddings=self.use_rotary_embeddings,
+         )
+
+         # post layer norm
+         self.post_layer_norm = LucaGPLMLayerNorm(self.embed_dim)
+
+         # dimension increase by the fully connected layer
+         self.fc1 = nn.Linear(self.embed_dim, self.ffn_embed_dim)
+
+         # dimension reduction by the fully connected layer
+         self.fc2 = nn.Linear(self.ffn_embed_dim, self.embed_dim)
+
+     def forward(
+         self,
+         x,
+         self_attn_mask=None,
+         self_attn_padding_mask=None,
+         need_head_weights=False
+     ):
+         residual = x
+         x = self.pre_layer_norm(x)
+         x, attn = self.self_attn(
+             query=x,
+             key=x,
+             value=x,
+             key_padding_mask=self_attn_padding_mask,
+             need_weights=True,
+             need_head_weights=need_head_weights,
+             attn_mask=self_attn_mask,
+         )
+         x = residual + x
+
+         residual = x
+         x = self.post_layer_norm(x)
+         x = gelu(self.fc1(x))
+         x = self.fc2(x)
+         x = residual + x
+
+         return x, attn
+
+
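A usage sketch (shapes are my assumptions, following the ESM-style `(seq_len, batch, embed_dim)` layout): the layer is pre-LN, so each sub-block runs LayerNorm, then the sublayer, then a residual add.

import torch

layer = LucaGPLMTransformerLayer(embed_dim=64, ffn_embed_dim=256, attention_heads=8)
x = torch.randn(12, 2, 64)                       # (seq_len, batch, embed_dim)
out, attn = layer(x, need_head_weights=True)
print(out.shape)   # torch.Size([12, 2, 64])
print(attn.shape)  # (heads, batch, query_len, key_len); key axis has one extra bias-kv slot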
+ class AxialTransformerLayer(nn.Module):
+     """Implements an Axial MSA Transformer block."""
+
+     def __init__(
+         self,
+         embedding_dim: int = 768,
+         ffn_embedding_dim: int = 3072,
+         num_attention_heads: int = 8,
+         dropout: float = 0.1,
+         attention_dropout: float = 0.1,
+         activation_dropout: float = 0.1,
+         max_tokens_per_msa: int = 2**14,
+     ) -> None:
+         super().__init__()
+
+         # Initialize parameters
+         self.embedding_dim = embedding_dim
+         self.dropout_prob = dropout
+
+         row_self_attention = RowSelfAttention(
+             embedding_dim,
+             num_attention_heads,
+             dropout=dropout,
+             max_tokens_per_msa=max_tokens_per_msa,
+         )
+
+         column_self_attention = ColumnSelfAttention(
+             embedding_dim,
+             num_attention_heads,
+             dropout=dropout,
+             max_tokens_per_msa=max_tokens_per_msa,
+         )
+
+         feed_forward_layer = FeedForwardNetwork(
+             embedding_dim,
+             ffn_embedding_dim,
+             activation_dropout=activation_dropout,
+             max_tokens_per_msa=max_tokens_per_msa,
+         )
+
+         self.row_self_attention = self.build_residual(row_self_attention)
+         self.column_self_attention = self.build_residual(column_self_attention)
+         self.feed_forward_layer = self.build_residual(feed_forward_layer)
+
+     def build_residual(self, layer: nn.Module):
+         return NormalizedResidualBlock(
+             layer,
+             self.embedding_dim,
+             self.dropout_prob,
+         )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         self_attn_mask: Optional[torch.Tensor] = None,
+         self_attn_padding_mask: Optional[torch.Tensor] = None,
+         need_head_weights: bool = False,
+     ):
+         """
+         LayerNorm is applied either before or after the self-attention/ffn
+         modules similar to the original Transformer implementation.
+         """
+         x, row_attn = self.row_self_attention(
+             x,
+             self_attn_mask=self_attn_mask,
+             self_attn_padding_mask=self_attn_padding_mask,
+         )
+         x, column_attn = self.column_self_attention(
+             x,
+             self_attn_mask=self_attn_mask,
+             self_attn_padding_mask=self_attn_padding_mask,
+         )
+         x = self.feed_forward_layer(x)
+         if need_head_weights:
+             return x, column_attn, row_attn
+         else:
+             return x
+
+
+ class LearnedPositionalEmbedding(nn.Embedding):
+     """
+     This module learns positional embeddings up to a fixed maximum size.
+     Padding ids are ignored by either offsetting based on padding_idx
+     or by setting padding_idx to None and ensuring that the appropriate
+     position ids are passed to the forward function.
+     """
+
+     def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
+         if padding_idx is not None:
+             num_embeddings_ = num_embeddings + padding_idx + 1
+         else:
+             num_embeddings_ = num_embeddings
+         super().__init__(num_embeddings_, embedding_dim, padding_idx)
+         self.max_positions = num_embeddings
+
+     def forward(self, input: torch.Tensor):
+         """Input is expected to be of size [bsz x seqlen]."""
+         if input.size(1) > self.max_positions:
+             raise ValueError(
+                 f"Sequence length {input.size(1)} above maximum "
+                 f"sequence length of {self.max_positions}"
+             )
+         mask = input.ne(self.padding_idx).int()
+         positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + self.padding_idx
+         return F.embedding(
+             positions,
+             self.weight,
+             self.padding_idx,
+             self.max_norm,
+             self.norm_type,
+             self.scale_grad_by_freq,
+             self.sparse,
+         )
+
+
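A standalone illustration of the padding-aware position ids computed above: positions count only non-pad tokens and are offset past `padding_idx`, so pad slots keep the padding embedding.

import torch

padding_idx = 1
tokens = torch.tensor([[5, 6, 7, padding_idx, padding_idx]])
mask = tokens.ne(padding_idx).int()
positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
print(positions)  # tensor([[2, 3, 4, 1, 1]]) -- pads stay at padding_idx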
+ class SinusoidalPositionalEmbedding(nn.Module):
+     def __init__(self, embed_dim, padding_idx, learned=False):
+         super().__init__()
+         self.embed_dim = embed_dim
+         self.padding_idx = padding_idx
+         self.register_buffer("_float_tensor", torch.FloatTensor(1))
+         self.weights = None
+
+     def forward(self, x):
+         bsz, seq_len = x.shape
+         max_pos = self.padding_idx + 1 + seq_len
+         if self.weights is None or max_pos > self.weights.size(0):
+             self.weights = self.get_embedding(max_pos)
+         self.weights = self.weights.type_as(self._float_tensor)
+
+         positions = self.make_positions(x)
+         return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()
+
+     def make_positions(self, x):
+         mask = x.ne(self.padding_idx)
+         range_buf = torch.arange(x.size(1), device=x.device).expand_as(x) + self.padding_idx + 1
+         positions = range_buf.expand_as(x)
+         return positions * mask.long() + self.padding_idx * (1 - mask.long())
+
+     def get_embedding(self, num_embeddings):
+         half_dim = self.embed_dim // 2
+         emb = math.log(10000) / (half_dim - 1)
+         emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
+         emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+         emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+         if self.embed_dim % 2 == 1:
+             # zero pad
+             emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+         if self.padding_idx is not None:
+             emb[self.padding_idx, :] = 0
+         return emb
+
+
+ class RobertaLMHead(nn.Module):
+     """Head for masked language modeling."""
+
+     def __init__(self, embed_dim, output_dim, weight):
+         super().__init__()
+         self.dense = nn.Linear(embed_dim, embed_dim)
+         self.layer_norm = LucaGPLM1bLayerNorm(embed_dim)
+         self.weight = weight
+         self.bias = nn.Parameter(torch.zeros(output_dim))
+
+     def forward(self, features):
+         x = self.dense(features)
+         x = gelu(x)
+         x = self.layer_norm(x)
+         # project back to size of vocabulary with bias
+         x = F.linear(x, self.weight) + self.bias
+         return x
+
+
+ class ContactPredictionHead(nn.Module):
+     """Performs symmetrization, apc, and computes a logistic regression on the output features"""
+
+     def __init__(
+         self,
+         in_features: int,
+         prepend_bos: bool,
+         append_eos: bool,
+         bias=True,
+         eos_idx: Optional[int] = None,
+     ):
+         super().__init__()
+         self.in_features = in_features
+         self.prepend_bos = prepend_bos
+         self.append_eos = append_eos
+         if append_eos and eos_idx is None:
+             raise ValueError("Using an alphabet with eos token, but no eos token was passed in.")
+         self.eos_idx = eos_idx
+         self.regression = nn.Linear(in_features, 1, bias)
+         self.activation = nn.Sigmoid()
+
+     def forward(self, tokens, attentions):
+         # remove eos token attentions
+         if self.append_eos:
+             eos_mask = tokens.ne(self.eos_idx).to(attentions)
+             eos_mask = eos_mask.unsqueeze(1) * eos_mask.unsqueeze(2)
+             attentions = attentions * eos_mask[:, None, None, :, :]
+             attentions = attentions[..., :-1, :-1]
+         # remove cls token attentions
+         if self.prepend_bos:
+             attentions = attentions[..., 1:, 1:]
+         batch_size, layers, heads, seqlen, _ = attentions.size()
+         attentions = attentions.view(batch_size, layers * heads, seqlen, seqlen)
+
+         # features: B x C x T x T
+         attentions = attentions.to(
+             self.regression.weight.device
+         )  # attentions always float32, may need to convert to float16
+         attentions = apc(symmetrize(attentions))
+         attentions = attentions.permute(0, 2, 3, 1)
+         return self.activation(self.regression(attentions).squeeze(3))
+
+
+ class NormalizedResidualBlock(nn.Module):
+     def __init__(
+         self,
+         layer: nn.Module,
+         embedding_dim: int,
+         dropout: float = 0.1,
+     ):
+         super().__init__()
+         self.embedding_dim = embedding_dim
+
+         self.layer = layer
+         self.dropout_module = nn.Dropout(
+             dropout,
+         )
+         self.layer_norm = LucaGPLM1bLayerNorm(self.embedding_dim)
+
+     def forward(self, x, *args, **kwargs):
+         residual = x
+         x = self.layer_norm(x)
+         outputs = self.layer(x, *args, **kwargs)
+         if isinstance(outputs, tuple):
+             x, *out = outputs
+         else:
+             x = outputs
+             out = None
+
+         x = self.dropout_module(x)
+         x = residual + x
+
+         if out is not None:
+             return (x,) + tuple(out)
+         else:
+             return x
+
+
+ class FeedForwardNetwork(nn.Module):
+     def __init__(
+         self,
+         embedding_dim: int,
+         ffn_embedding_dim: int,
+         activation_dropout: float = 0.1,
+         max_tokens_per_msa: int = 2**14,
+     ):
+         super().__init__()
+         self.embedding_dim = embedding_dim
+         self.ffn_embedding_dim = ffn_embedding_dim
+         self.max_tokens_per_msa = max_tokens_per_msa
+         self.activation_fn = nn.GELU()
+         self.activation_dropout_module = nn.Dropout(
+             activation_dropout,
+         )
+         self.fc1 = nn.Linear(embedding_dim, ffn_embedding_dim)
+         self.fc2 = nn.Linear(ffn_embedding_dim, embedding_dim)
+
+     def forward(self, x):
+         x = self.activation_fn(self.fc1(x))
+         x = self.activation_dropout_module(x)
+         x = self.fc2(x)
+         return x
+
+
+ class RowSelfAttention(nn.Module):
+     """Compute self-attention over rows of a 2D input."""
+
+     def __init__(
+         self,
+         embed_dim,
+         num_heads,
+         dropout=0.0,
+         max_tokens_per_msa: int = 2 ** 16,
+     ):
+         super().__init__()
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.head_dim = embed_dim // num_heads
+         self.scaling = self.head_dim ** -0.5
+         self.max_tokens_per_msa = max_tokens_per_msa
+         self.attn_shape = "hnij"
+
+         self.k_proj = nn.Linear(embed_dim, embed_dim)
+         self.v_proj = nn.Linear(embed_dim, embed_dim)
+         self.q_proj = nn.Linear(embed_dim, embed_dim)
+
+         self.out_proj = nn.Linear(embed_dim, embed_dim)
+         self.dropout_module = nn.Dropout(dropout)
+
+     def align_scaling(self, q):
+         num_rows = q.size(0)
+         return self.scaling / math.sqrt(num_rows)
+
+     def _batched_forward(
+         self,
+         x,
+         self_attn_mask=None,
+         self_attn_padding_mask=None,
+     ):
+         num_rows, num_cols, batch_size, embed_dim = x.size()
+         max_rows = max(1, self.max_tokens_per_msa // num_cols)
+         attns = 0
+         scaling = self.align_scaling(x)
+         for start in range(0, num_rows, max_rows):
+             attn_weights = self.compute_attention_weights(
+                 x[start : start + max_rows],
+                 scaling,
+                 self_attn_mask=self_attn_mask,
+                 self_attn_padding_mask=self_attn_padding_mask[:, start : start + max_rows]
+                 if self_attn_padding_mask is not None
+                 else None,
+             )
+             attns += attn_weights
+         attn_probs = attns.softmax(-1)
+         attn_probs = self.dropout_module(attn_probs)
+
+         outputs = []
+         for start in range(0, num_rows, max_rows):
+             output = self.compute_attention_update(x[start : start + max_rows], attn_probs)
+             outputs.append(output)
+
+         output = torch.cat(outputs, 0)
+         return output, attn_probs
+
+     def compute_attention_weights(
+         self,
+         x,
+         scaling: float,
+         self_attn_mask=None,
+         self_attn_padding_mask=None,
+     ):
+         num_rows, num_cols, batch_size, embed_dim = x.size()
+         q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+         k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+         q *= scaling
+         if self_attn_padding_mask is not None:
+             # Zero out any padded aligned positions - this is important since
+             # we take a sum across the alignment axis.
+             q *= 1 - self_attn_padding_mask.permute(1, 2, 0).unsqueeze(3).unsqueeze(4).to(q)
+
+         attn_weights = torch.einsum(f"rinhd,rjnhd->{self.attn_shape}", q, k)
+
+         if self_attn_mask is not None:
+             raise NotImplementedError
+             # Mask Size: [B x R x C], Weights Size: [H x B x C x C]
+
+         if self_attn_padding_mask is not None:
+             attn_weights = attn_weights.masked_fill(
+                 self_attn_padding_mask[:, 0].unsqueeze(0).unsqueeze(2),
+                 -10000,
+             )
+
+         return attn_weights
+
+     def compute_attention_update(
+         self,
+         x,
+         attn_probs,
+     ):
+         num_rows, num_cols, batch_size, embed_dim = x.size()
+         v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+         context = torch.einsum(f"{self.attn_shape},rjnhd->rinhd", attn_probs, v)
+         context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
+         output = self.out_proj(context)
+         return output
+
+     def forward(
+         self,
+         x,
+         self_attn_mask=None,
+         self_attn_padding_mask=None,
+     ):
+         num_rows, num_cols, batch_size, embed_dim = x.size()
+         if (num_rows * num_cols > self.max_tokens_per_msa) and not torch.is_grad_enabled():
+             return self._batched_forward(x, self_attn_mask, self_attn_padding_mask)
+         else:
+             scaling = self.align_scaling(x)
+             attn_weights = self.compute_attention_weights(
+                 x, scaling, self_attn_mask, self_attn_padding_mask
+             )
+             attn_probs = attn_weights.softmax(-1)
+             attn_probs = self.dropout_module(attn_probs)
+             output = self.compute_attention_update(x, attn_probs)
+             return output, attn_probs
+
+
+ class ColumnSelfAttention(nn.Module):
+     """Compute self-attention over columns of a 2D input."""
+
+     def __init__(
+         self,
+         embed_dim,
+         num_heads,
+         dropout=0.0,
+         max_tokens_per_msa: int = 2 ** 16,
+     ):
+         super().__init__()
+
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.head_dim = embed_dim // num_heads
+         self.scaling = self.head_dim ** -0.5
+         self.max_tokens_per_msa = max_tokens_per_msa
+
+         self.k_proj = nn.Linear(embed_dim, embed_dim)
+         self.v_proj = nn.Linear(embed_dim, embed_dim)
+         self.q_proj = nn.Linear(embed_dim, embed_dim)
+
+         self.out_proj = nn.Linear(embed_dim, embed_dim)
+         self.dropout_module = nn.Dropout(dropout)
+
+     def _batched_forward(
+         self,
+         x,
+         self_attn_mask=None,
+         self_attn_padding_mask=None,
+     ):
+         num_rows, num_cols, batch_size, embed_dim = x.size()
+         max_cols = max(1, self.max_tokens_per_msa // num_rows)
+         outputs = []
+         attns = []
+         for start in range(0, num_cols, max_cols):
+             output, attn = self(
+                 x[:, start : start + max_cols],
+                 self_attn_mask=self_attn_mask,
+                 self_attn_padding_mask=self_attn_padding_mask[:, :, start : start + max_cols]
+                 if self_attn_padding_mask is not None
+                 else None,
+             )
+             outputs.append(output)
+             attns.append(attn)
+         output = torch.cat(outputs, 1)
+         attns = torch.cat(attns, 1)
+         return output, attns
+
+     def compute_attention_update(
+         self,
+         x,
+         self_attn_mask=None,
+         self_attn_padding_mask=None,
+     ):
+         num_rows, num_cols, batch_size, embed_dim = x.size()
+         if num_rows == 1:
+             # if there is only 1 position, this is equivalent and doesn't break with padding
+             attn_probs = torch.ones(
+                 self.num_heads,
+                 num_cols,
+                 batch_size,
+                 num_rows,
+                 num_rows,
+                 device=x.device,
+                 dtype=x.dtype,
+             )
+             output = self.out_proj(self.v_proj(x))
+         else:
+             q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+             k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+             v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+             q *= self.scaling
+
+             attn_weights = torch.einsum("icnhd,jcnhd->hcnij", q, k)
+
+             if self_attn_mask is not None:
+                 raise NotImplementedError
+             if self_attn_padding_mask is not None:
+                 attn_weights = attn_weights.masked_fill(
+                     self_attn_padding_mask.permute(2, 0, 1).unsqueeze(0).unsqueeze(3),
+                     -10000,
+                 )
+
+             attn_probs = attn_weights.softmax(-1)
+             attn_probs = self.dropout_module(attn_probs)
+             context = torch.einsum("hcnij,jcnhd->icnhd", attn_probs, v)
+             context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
+             output = self.out_proj(context)
+         return output, attn_probs
+
+     def forward(
+         self,
+         x,
+         self_attn_mask=None,
+         self_attn_padding_mask=None,
+     ):
+         num_rows, num_cols, batch_size, embed_dim = x.size()
+         # if False and num_rows * num_cols > 2 ** 14 and not torch.is_grad_enabled():
+         if (num_rows * num_cols) > self.max_tokens_per_msa and not torch.is_grad_enabled():
+             return self._batched_forward(
+                 x,
+                 self_attn_mask,
+                 self_attn_padding_mask,
+             )
+         else:
+             return self.compute_attention_update(x, self_attn_mask, self_attn_padding_mask)
+
+
+ def utils_softmax(x, dim: int, onnx_trace: bool = False):
+     if onnx_trace:
+         return F.softmax(x.float(), dim=dim)
+     else:
+         return F.softmax(x, dim=dim, dtype=torch.float32)
+
+
+ class FairseqIncrementalState(object):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.init_incremental_state()
+
+     def init_incremental_state(self):
+         self._incremental_state_id = str(uuid.uuid4())
+
+     def _get_full_incremental_state_key(self, key: str) -> str:
+         return "{}.{}".format(self._incremental_state_id, key)
+
+     def get_incremental_state(
+         self,
+         incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+         key: str,
+     ) -> Optional[Dict[str, Optional[Tensor]]]:
+         """Helper for getting incremental state for an nn.Module."""
+         full_key = self._get_full_incremental_state_key(key)
+         if incremental_state is None or full_key not in incremental_state:
+             return None
+         return incremental_state[full_key]
+
+     def set_incremental_state(
+         self,
+         incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+         key: str,
+         value: Dict[str, Optional[Tensor]],
+     ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
+         """Helper for setting incremental state for an nn.Module."""
+         if incremental_state is not None:
+             full_key = self._get_full_incremental_state_key(key)
+             incremental_state[full_key] = value
+         return incremental_state
+
+
+ def with_incremental_state(cls):
+     cls.__bases__ = (FairseqIncrementalState,) + tuple(
+         b for b in cls.__bases__ if b != FairseqIncrementalState
+     )
+     return cls
+
+
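A toy walkthrough (the `Dummy` module is hypothetical) of the incremental-state helpers above: the decorator splices `FairseqIncrementalState` into the class bases, and each instance namespaces its cache keys with a per-instance UUID.

import torch

@with_incremental_state
class Dummy(torch.nn.Module):
    def __init__(self):
        super().__init__()  # also runs init_incremental_state() via the MRO

state = {}
m = Dummy()
m.set_incremental_state(state, "attn_state", {"prev_key": torch.zeros(1, 2, 3)})
print(list(state))  # ['<uuid>.attn_state']
print(m.get_incremental_state(state, "attn_state")["prev_key"].shape)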
+ @with_incremental_state
+ class LucaGPLMMultiheadAttention(nn.Module):
+     """Multi-headed attention.
+
+     See "Attention Is All You Need" for more details.
+     """
+
+     def __init__(
+         self,
+         embed_dim,
+         num_heads,
+         kdim=None,
+         vdim=None,
+         dropout=0.0,
+         bias=True,
+         add_bias_kv: bool = False,
+         add_zero_attn: bool = False,
+         self_attention: bool = False,
+         encoder_decoder_attention: bool = False,
+         use_rotary_embeddings: bool = False,
+     ):
+         super().__init__()
+         self.embed_dim = embed_dim
+         self.kdim = kdim if kdim is not None else embed_dim
+         self.vdim = vdim if vdim is not None else embed_dim
+         self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.head_dim = embed_dim // num_heads
+         assert (
+             self.head_dim * num_heads == self.embed_dim
+         ), "embed_dim must be divisible by num_heads"
+         self.scaling = self.head_dim**-0.5
+
+         self.self_attention = self_attention
+         self.encoder_decoder_attention = encoder_decoder_attention
+
+         assert not self.self_attention or self.qkv_same_dim, (
+             "Self-attention requires query, key and " "value to be of the same size"
+         )
+
+         self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
+         self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
+         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+         if add_bias_kv:
+             self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
+             self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
+         else:
+             self.bias_k = self.bias_v = None
+
+         self.add_zero_attn = add_zero_attn
+
+         self.reset_parameters()
+
+         self.onnx_trace = False
+         self.rot_emb = None
+         if use_rotary_embeddings:
+             self.rot_emb = RotaryEmbedding(dim=self.head_dim)
+
+         self.enable_torch_version = False
+         if hasattr(F, "multi_head_attention_forward"):
+             self.enable_torch_version = True
+         else:
+             self.enable_torch_version = False
+
+     def prepare_for_onnx_export_(self):
+         self.onnx_trace = True
+
+     def reset_parameters(self):
+         '''
+         if self.qkv_same_dim:
+             # Empirically observed the convergence to be much better with
+             # the scaled initialization
+             nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
+             nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
+             nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
+         else:
+             nn.init.xavier_uniform_(self.k_proj.weight)
+             nn.init.xavier_uniform_(self.v_proj.weight)
+             nn.init.xavier_uniform_(self.q_proj.weight)
+         '''
+         nn.init.xavier_uniform_(self.k_proj.weight, gain=nn.init.calculate_gain("relu"))
+         nn.init.xavier_uniform_(self.v_proj.weight, gain=nn.init.calculate_gain("relu"))
+         nn.init.xavier_uniform_(self.q_proj.weight, gain=nn.init.calculate_gain("relu"))
+
+         nn.init.xavier_uniform_(self.out_proj.weight, gain=nn.init.calculate_gain("relu"))
+         # nn.init.xavier_uniform_(self.out_proj.weight)
+         if self.out_proj.bias is not None:
+             nn.init.constant_(self.out_proj.bias, 0.0)
+         if self.bias_k is not None:
+             nn.init.xavier_normal_(self.bias_k)
+         if self.bias_v is not None:
+             nn.init.xavier_normal_(self.bias_v)
+
+     def forward(
+         self,
+         query,
+         key: Optional[Tensor],
+         value: Optional[Tensor],
+         key_padding_mask: Optional[Tensor] = None,
+         incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+         need_weights: bool = True,
+         static_kv: bool = False,
+         attn_mask: Optional[Tensor] = None,
+         before_softmax: bool = False,
+         need_head_weights: bool = False,
+     ) -> Tuple[Tensor, Optional[Tensor]]:
+         """Input shape: Time x Batch x Channel
+
+         Args:
+             key_padding_mask (ByteTensor, optional): mask to exclude
+                 keys that are pads, of shape `(batch, src_len)`, where
+                 padding elements are indicated by 1s.
+             need_weights (bool, optional): return the attention weights,
+                 averaged over heads (default: True).
+             attn_mask (ByteTensor, optional): typically used to
+                 implement causal attention, where the mask prevents the
+                 attention from looking forward in time (default: None).
+             before_softmax (bool, optional): return the raw attention
+                 weights and values before the attention softmax.
+             need_head_weights (bool, optional): return the attention
+                 weights for each head. Implies *need_weights*. Default:
+                 return the average attention weights over all heads.
+         """
+         if need_head_weights:
+             need_weights = True
+
+         tgt_len, bsz, embed_dim = query.size()
+         assert embed_dim == self.embed_dim
+         assert list(query.size()) == [tgt_len, bsz, embed_dim]
+
+         if (
+             not self.rot_emb
+             and self.enable_torch_version
+             and not self.onnx_trace
+             and incremental_state is None
+             and not static_kv
+             # A workaround for quantization to work. Otherwise JIT compilation
+             # treats bias in linear module as method.
+             and not torch.jit.is_scripting()
+             and not need_head_weights
+         ):
+             assert key is not None and value is not None
+             return F.multi_head_attention_forward(
+                 query,
+                 key,
+                 value,
+                 self.embed_dim,
+                 self.num_heads,
+                 torch.empty([0]),
+                 torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
+                 self.bias_k,
+                 self.bias_v,
+                 self.add_zero_attn,
+                 self.dropout,
+                 self.out_proj.weight,
+                 self.out_proj.bias,
+                 self.training,
+                 key_padding_mask,
+                 need_weights,
+                 attn_mask,
+                 use_separate_proj_weight=True,
+                 q_proj_weight=self.q_proj.weight,
+                 k_proj_weight=self.k_proj.weight,
+                 v_proj_weight=self.v_proj.weight,
+             )
+         if incremental_state is not None:
+             saved_state = self._get_input_buffer(incremental_state)
+             if saved_state is not None and "prev_key" in saved_state:
+                 # previous time steps are cached - no need to recompute
+                 # key and value if they are static
+                 if static_kv:
+                     assert self.encoder_decoder_attention and not self.self_attention
+                     key = value = None
+         else:
+             saved_state = None
+
+         if self.self_attention:
+             q = self.q_proj(query)
+             k = self.k_proj(query)
+             v = self.v_proj(query)
+         elif self.encoder_decoder_attention:
+             # encoder-decoder attention
+             q = self.q_proj(query)
+             if key is None:
+                 assert value is None
+                 k = v = None
+             else:
+                 k = self.k_proj(key)
+                 v = self.v_proj(key)
+
+         else:
+             assert key is not None and value is not None
+             q = self.q_proj(query)
+             k = self.k_proj(key)
+             v = self.v_proj(value)
+         q *= self.scaling
+
+         if self.bias_k is not None:
+             assert self.bias_v is not None
+             k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
+             v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
+             if attn_mask is not None:
+                 attn_mask = torch.cat(
+                     [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                 )
+             if key_padding_mask is not None:
+                 key_padding_mask = torch.cat(
+                     [
+                         key_padding_mask,
+                         key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
+                     ],
+                     dim=1,
+                 )
+
+         q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+         if k is not None:
+             k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+         if v is not None:
+             v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+
+         if saved_state is not None:
+             # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+             if "prev_key" in saved_state:
+                 _prev_key = saved_state["prev_key"]
+                 assert _prev_key is not None
+                 prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
+                 if static_kv:
+                     k = prev_key
+                 else:
+                     assert k is not None
+                     k = torch.cat([prev_key, k], dim=1)
+             if "prev_value" in saved_state:
+                 _prev_value = saved_state["prev_value"]
+                 assert _prev_value is not None
+                 prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
+                 if static_kv:
+                     v = prev_value
+                 else:
+                     assert v is not None
+                     v = torch.cat([prev_value, v], dim=1)
+             prev_key_padding_mask: Optional[Tensor] = None
+             if "prev_key_padding_mask" in saved_state:
+                 prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+             assert k is not None and v is not None
+             key_padding_mask = LucaGPLMMultiheadAttention._append_prev_key_padding_mask(
+                 key_padding_mask=key_padding_mask,
+                 prev_key_padding_mask=prev_key_padding_mask,
+                 batch_size=bsz,
+                 src_len=k.size(1),
+                 static_kv=static_kv,
+             )
+
+             saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
+             saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
+             saved_state["prev_key_padding_mask"] = key_padding_mask
+             # In this branch incremental_state is never None
+             assert incremental_state is not None
+             incremental_state = self._set_input_buffer(incremental_state, saved_state)
+         assert k is not None
+         src_len = k.size(1)
+
+         # This is part of a workaround to get around fork/join parallelism
+         # not supporting Optional types.
+         if key_padding_mask is not None and key_padding_mask.dim() == 0:
+             key_padding_mask = None
+
+         if key_padding_mask is not None:
+             assert key_padding_mask.size(0) == bsz
+             assert key_padding_mask.size(1) == src_len
+
+         if self.add_zero_attn:
+             assert v is not None
+             src_len += 1
+             k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
+             v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
+             if attn_mask is not None:
+                 attn_mask = torch.cat(
+                     [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                 )
+             if key_padding_mask is not None:
+                 key_padding_mask = torch.cat(
+                     [
+                         key_padding_mask,
+                         torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask),
+                     ],
+                     dim=1,
+                 )
+
+         if self.rot_emb:
+             q, k = self.rot_emb(q, k)
+
+         attn_weights = torch.bmm(q, k.transpose(1, 2))
+         attn_weights = LucaGPLMMultiheadAttention.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+         assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+
+         if attn_mask is not None:
+             attn_mask = attn_mask.unsqueeze(0)
+             if self.onnx_trace:
+                 attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
+             attn_weights += attn_mask
+
+         if key_padding_mask is not None:
+             # don't attend to padding symbols
+             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+             attn_weights = attn_weights.masked_fill(
+                 key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
+             )
+             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+         if before_softmax:
+             return attn_weights, v
+
+         attn_weights_float = utils_softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace)
+         attn_weights = attn_weights_float.type_as(attn_weights)
+         attn_probs = F.dropout(
+             attn_weights_float.type_as(attn_weights),
+             p=self.dropout,
+             training=self.training,
+         )
+         assert v is not None
+         attn = torch.bmm(attn_probs, v)
+         assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
+         if self.onnx_trace and attn.size(1) == 1:
+             # when ONNX tracing a single decoder step (sequence length == 1)
+             # the transpose is a no-op copy before view, thus unnecessary
+             attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
+         else:
+             attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+         attn = self.out_proj(attn)
+         attn_weights: Optional[Tensor] = None
+         if need_weights:
+             attn_weights = attn_weights_float.view(
+                 bsz, self.num_heads, tgt_len, src_len
+             ).type_as(attn).transpose(1, 0)
+             if not need_head_weights:
+                 # average attention weights over heads
+                 attn_weights = attn_weights.mean(dim=0)
+
+         return attn, attn_weights
+
+     @staticmethod
+     def _append_prev_key_padding_mask(
+         key_padding_mask: Optional[Tensor],
+         prev_key_padding_mask: Optional[Tensor],
+         batch_size: int,
+         src_len: int,
+         static_kv: bool,
+     ) -> Optional[Tensor]:
+         # saved key padding masks have shape (bsz, seq_len)
+         if prev_key_padding_mask is not None and static_kv:
+             new_key_padding_mask = prev_key_padding_mask
+         elif prev_key_padding_mask is not None and key_padding_mask is not None:
+             new_key_padding_mask = torch.cat(
+                 [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+             )
+         # During incremental decoding, as the padding token enters and
+         # leaves the frame, there will be a time when prev or current
+         # is None
+         elif prev_key_padding_mask is not None:
+             filler = torch.zeros(
+                 (batch_size, src_len - prev_key_padding_mask.size(1)),
+                 device=prev_key_padding_mask.device,
+             )
+             new_key_padding_mask = torch.cat(
+                 [prev_key_padding_mask.float(), filler.float()], dim=1
+             )
+         elif key_padding_mask is not None:
+             filler = torch.zeros(
+                 (batch_size, src_len - key_padding_mask.size(1)),
+                 device=key_padding_mask.device,
+             )
+             new_key_padding_mask = torch.cat([filler.float(), key_padding_mask.float()], dim=1)
+         else:
+             new_key_padding_mask = prev_key_padding_mask
+         return new_key_padding_mask
+
+     @torch.jit.export
+     def reorder_incremental_state(
+         self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor
+     ):
+         """Reorder buffered internal state (for incremental generation)."""
+         input_buffer = self._get_input_buffer(incremental_state)
+         if input_buffer is not None:
+             for k in input_buffer.keys():
+                 input_buffer_k = input_buffer[k]
+                 if input_buffer_k is not None:
+                     if self.encoder_decoder_attention and input_buffer_k.size(0) == new_order.size(
+                         0
+                     ):
+                         break
+                     input_buffer[k] = input_buffer_k.index_select(0, new_order)
+             incremental_state = self._set_input_buffer(incremental_state, input_buffer)
+         return incremental_state
+
+     def _get_input_buffer(
+         self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+     ) -> Dict[str, Optional[Tensor]]:
+         result = self.get_incremental_state(incremental_state, "attn_state")
+         if result is not None:
+             return result
+         else:
+             empty_result: Dict[str, Optional[Tensor]] = {}
+             return empty_result
+
+     def _set_input_buffer(
+         self,
+         incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+         buffer: Dict[str, Optional[Tensor]],
+     ):
+         return self.set_incremental_state(incremental_state, "attn_state", buffer)
+
+     @staticmethod
+     def apply_sparse_mask(attn_weights, tgt_len: int, src_len: int, bsz: int):
+         return attn_weights
+
+     def upgrade_state_dict_named(self, state_dict, name):
+         prefix = name + "." if name != "" else ""
+         items_to_add = {}
+         keys_to_remove = []
+         for k in state_dict.keys():
+             if k.endswith(prefix + "in_proj_weight"):
+                 # in_proj_weight used to be q + k + v with same dimensions
+                 dim = int(state_dict[k].shape[0] / 3)
+                 items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
+                 items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
+                 items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]
+
+                 keys_to_remove.append(k)
+
+                 k_bias = prefix + "in_proj_bias"
+                 if k_bias in state_dict.keys():
+                     dim = int(state_dict[k].shape[0] / 3)
+                     items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
+                     items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][dim : 2 * dim]
+                     items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]
+
+                     keys_to_remove.append(prefix + "in_proj_bias")
+
+         for k in keys_to_remove:
+             del state_dict[k]
+
+         for key, value in items_to_add.items():
+             state_dict[key] = value
+
+
+ def rotate_half(x):
+     x1, x2 = x.chunk(2, dim=-1)
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def apply_rotary_pos_emb(x, cos, sin):
+     cos = cos[:, : x.shape[-2], :]
+     sin = sin[:, : x.shape[-2], :]
+
+     return (x * cos) + (rotate_half(x) * sin)
+
+
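A small check (illustrative; the frequency table is built the same way as in the class below): rotary embeddings rotate channel pairs, so per-token norms are preserved while relative phase encodes position.

import torch

dim, seq = 8, 5
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(seq).float()
freqs = torch.einsum("i,j->ij", t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)[None, :, :]
q = torch.randn(1, seq, dim)
q_rot = apply_rotary_pos_emb(q, emb.cos(), emb.sin())
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True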
+ class RotaryEmbedding(torch.nn.Module):
1176
+ """
1177
+ The rotary position embeddings from RoFormer_ (Su et. al).
1178
+ A crucial insight from the method is that the query and keys are
1179
+ transformed by rotation matrices which depend on the relative positions.
1180
+ Other implementations are available in the Rotary Transformer repo_ and in
1181
+ GPT-NeoX_, GPT-NeoX was an inspiration
1182
+ .. _RoFormer: https://arxiv.org/abs/2104.09864
1183
+ .. _repo: https://github.com/ZhuiyiTechnology/roformer
1184
+ .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
1185
+ .. warning: Please note that this embedding is not registered on purpose, as it is transformative
1186
+ (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
1187
+ """
1188
+
1189
+ def __init__(self, dim: int, *_, **__):
1190
+ super().__init__()
1191
+ # Generate and save the inverse frequency buffer (non trainable)
1192
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
1193
+ self.register_buffer("inv_freq", inv_freq)
1194
+
1195
+ self._seq_len_cached = None
1196
+ self._cos_cached = None
1197
+ self._sin_cached = None
1198
+
1199
+ def _update_cos_sin_tables(self, x, seq_dimension=1):
1200
+ seq_len = x.shape[seq_dimension]
1201
+
1202
+ # Reset the tables if the sequence length has changed,
1203
+ # or if we're on a new device (possibly due to tracing for instance)
1204
+ if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
1205
+ self._seq_len_cached = seq_len
1206
+ t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
1207
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
1208
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
1209
+
1210
+ self._cos_cached = emb.cos()[None, :, :]
1211
+ self._sin_cached = emb.sin()[None, :, :]
1212
+
1213
+ return self._cos_cached, self._sin_cached
1214
+
1215
+ def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
1216
+ self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2)
1217
+
1218
+ return (
1219
+ apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached),
1220
+ apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached),
1221
+ )
1222
+
1223
+
1224
+
1225
+
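A minimal usage sketch of the rotary embedding above (the shapes are assumptions for illustration: batch 2, 8 positions, head dimension 16; the module rotates q and k instead of adding positional vectors):

import torch

rot = RotaryEmbedding(dim=16)
q = torch.randn(2, 8, 16)   # (batch, seq_len, head_dim)
k = torch.randn(2, 8, 16)
q_rot, k_rot = rot(q, k)    # same shapes; relative positions are now encoded in the q/k dot products
assert q_rot.shape == q.shape and k_rot.shape == k.shape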
multi_label_metrics.py ADDED
@@ -0,0 +1,536 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+ '''
+ @license: (C) Copyright 2021, Hey.
+ @author: Hey
+ @email: sanyuan.**@**.com
+ @tel: 137****6540
+ @datetime: 2022/11/26 21:05
+ @project: LucaOne
+ @file: multi_label_metrics.py
+ @desc: metrics for multi-label classification
+ '''
+ import csv
+ import numpy as np
+ import torch
+ from sklearn.metrics import roc_auc_score, average_precision_score
+
+
+ def multi_label_acc(targets, probs, threshold=0.5):
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     acc_list = []
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         intersection_len = len(set(target_relevant).intersection(set(pred_relevant)))
+         if union_len == 0:
+             acc_list.append(1.0)
+         else:
+             # acc
+             acc = 1.0 - (union_len - intersection_len) / targets.shape[1]
+             acc_list.append(acc)
+     return round(sum(acc_list)/len(acc_list), 6) if len(acc_list) > 0 else 0
+
+
+ def multi_label_precision(targets, probs, threshold=0.5):
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     prec_list = []
+
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         target_len = len(target_relevant)
+         predict_len = len(pred_relevant)
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         intersection_len = len(set(target_relevant).intersection(set(pred_relevant)))
+         if union_len == 0:
+             prec_list.append(1.0)
+         else:
+             # precision
+             prec = 0.0
+             if predict_len > 0:
+                 prec = intersection_len / predict_len
+             prec_list.append(prec)
+
+     return round(sum(prec_list)/len(prec_list), 6) if len(prec_list) > 0 else 0
+
+
+ def multi_label_recall(targets, probs, threshold=0.5):
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     recall_list = []
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         target_len = len(target_relevant)
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         intersection_len = len(set(target_relevant).intersection(set(pred_relevant)))
+         if union_len == 0:
+             recall_list.append(1.0)
+         else:
+             # recall
+             if target_len > 0:
+                 recall = intersection_len / target_len
+             else:
+                 recall = 1.0
+             recall_list.append(recall)
+     return round(sum(recall_list)/len(recall_list), 6) if len(recall_list) > 0 else 0
+
+
+ def multi_label_jaccard(targets, probs, threshold=0.5):
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     jaccard_list = []
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         intersection_len = len(set(target_relevant).intersection(set(pred_relevant)))
+         if union_len == 0:
+             jaccard_list.append(1.0)
+         else:
+             # jaccard sim
+             jac = intersection_len / union_len
+             jaccard_list.append(jac)
+     return round(sum(jaccard_list)/len(jaccard_list), 6) if len(jaccard_list) > 0 else 0
+
+
+ def multi_label_f1(targets, probs, threshold=0.5):
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     f1_list = []
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         target_len = len(target_relevant)
+         predict_len = len(pred_relevant)
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         intersection_len = len(set(target_relevant).intersection(set(pred_relevant)))
+         if union_len == 0:
+             f1_list.append(1.0)
+         else:
+             # precision
+             prec = 0.0
+             if predict_len > 0:
+                 prec = intersection_len / predict_len
+             # recall
+             if target_len > 0:
+                 recall = intersection_len / target_len
+             else:
+                 recall = 1.0
+             # f1
+             if prec + recall == 0:
+                 f1 = 0.0
+             else:
+                 f1 = 2.0 * prec * recall / (prec + recall)
+             f1_list.append(f1)
+     return round(sum(f1_list)/len(f1_list), 6) if len(f1_list) > 0 else 0
+
+
+ def multi_label_roc_auc(targets, probs, threshold=0.5):
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     roc_auc_list = []
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         if union_len == 0:
+             roc_auc_list.append(1.0)
+         else:
+             # roc_auc
+             if len(np.unique(targets[idx, :])) > 1:
+                 roc_auc = roc_auc_macro(targets[idx, :], probs[idx, :])
+                 roc_auc_list.append(roc_auc)
+     return round(sum(roc_auc_list)/len(roc_auc_list), 6) if len(roc_auc_list) > 0 else 0
+
+
+ def multi_label_pr_auc(targets, probs, threshold=0.5):
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     pr_auc_list = []
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         if union_len == 0:
+             pr_auc_list.append(1.0)
+         else:
+             # pr_auc
+             if len(np.unique(targets[idx, :])) > 1:
+                 pr_auc = pr_auc_macro(targets[idx, :], probs[idx, :])
+                 pr_auc_list.append(pr_auc)
+     return round(sum(pr_auc_list)/len(pr_auc_list), 6) if len(pr_auc_list) > 0 else 0
+
+
+ def metrics_multi_label(targets, probs, threshold=0.5):
+     '''
+     metrics of multi-label classification
+     calculate metrics from the ground-truth matrix and the predicted probability matrix
+     :param targets: true 0-1 indicator matrix (n_samples, n_labels)
+     :param probs: 0~1 probability matrix (n_samples, n_labels)
+     :param threshold: negative-positive threshold
+     :return: some metrics
+     '''
+     targets_relevant = relevant_indexes(targets)
+     preds_relevant = relevant_indexes((probs >= threshold).astype(int))
+     acc_list = []
+     prec_list = []
+     recall_list = []
+     jaccard_list = []
+     f1_list = []
+     roc_auc_list = []
+     pr_auc_list = []
+     for idx in range(targets.shape[0]):
+         target_relevant = targets_relevant[idx]
+         pred_relevant = preds_relevant[idx]
+         target_len = len(target_relevant)
+         predict_len = len(pred_relevant)
+         union_len = len(set(target_relevant).union(set(pred_relevant)))
+         intersection_len = len(set(target_relevant).intersection(set(pred_relevant)))
+         if union_len == 0:
+             acc_list.append(1.0)
+             prec_list.append(1.0)
+             recall_list.append(1.0)
+             roc_auc_list.append(1.0)
+             jaccard_list.append(1.0)
+             f1_list.append(1.0)
+             pr_auc_list.append(1.0)
+         else:
+             # acc
+             acc = 1.0 - (union_len - intersection_len) / targets.shape[1]
+             acc_list.append(acc)
+
+             # precision
+             prec = 0.0
+             if predict_len > 0:
+                 prec = intersection_len / predict_len
+             prec_list.append(prec)
+
+             # recall
+             if target_len > 0:
+                 recall = intersection_len / target_len
+             else:
+                 recall = 1.0
+             recall_list.append(recall)
+
+             # jaccard sim
+             jac = intersection_len / union_len
+             jaccard_list.append(jac)
+
+             # f1
+             if prec + recall == 0:
+                 f1 = 0.0
+             else:
+                 f1 = 2.0 * prec * recall / (prec + recall)
+             f1_list.append(f1)
+
+             # roc_auc and pr_auc (only defined when both classes occur in the row)
+             if len(np.unique(targets[idx, :])) > 1:
+                 roc_auc = roc_auc_macro(targets[idx, :], probs[idx, :])
+                 roc_auc_list.append(roc_auc)
+                 pr_auc = pr_auc_macro(targets[idx, :], probs[idx, :])
+                 pr_auc_list.append(pr_auc)
+
+     f_max_value, p_max_value, r_max_value, t_max_value, preds_max_value = f_max(targets, probs)
+     return {
+         "acc": round(float(sum(acc_list)/len(acc_list)), 6) if len(acc_list) > 0 else 0,
+         "jaccard": round(float(sum(jaccard_list)/len(jaccard_list)), 6) if len(jaccard_list) > 0 else 0,
+         "prec": round(float(sum(prec_list)/len(prec_list)), 6) if len(prec_list) > 0 else 0,
+         "recall": round(float(sum(recall_list)/len(recall_list)), 6) if len(recall_list) > 0 else 0,
+         "f1": round(float(sum(f1_list)/len(f1_list)), 6) if len(f1_list) > 0 else 0,
+         "pr_auc": round(float(sum(pr_auc_list)/len(pr_auc_list)), 6) if len(pr_auc_list) > 0 else 0,
+         "roc_auc": round(float(sum(roc_auc_list)/len(roc_auc_list)), 6) if len(roc_auc_list) > 0 else 0,
+         "fmax": round(float(f_max_value), 6),
+         "pmax": round(float(p_max_value), 6),
+         "rmax": round(float(r_max_value), 6),
+         "tmax": round(float(t_max_value), 6)
+     }
+
+
+ def f_max(targets, probs, gos=None):
+     '''
+     f-max for multi-label classification
+     :param targets: true 0-1 indicator matrix (n_samples, n_labels)
+     :param probs: 0~1 probability matrix (n_samples, n_labels)
+     :param gos:
+     :return: fmax, p_max (precision max), r_max (recall max), t_max (classification threshold), preds_max (0-1 indicator matrix)
+     '''
+     preds_max = None
+     f_max = 0
+     p_max = 0
+     r_max = 0
+     t_max = 0
+     # from 0.01 to 1 (100 thresholds)
+     for tt in range(1, 101):
+         threshold = tt / 100.0
+         preds = (probs > threshold).astype(np.int32)
+         p = 0.0
+         r = 0.0
+         total = 0
+         p_total = 0
+         for i in range(preds.shape[0]):
+             tp = np.sum(preds[i, :] * targets[i, :])
+             fp = np.sum(preds[i, :]) - tp
+             fn = np.sum(targets[i, :]) - tp
+             if gos:
+                 fn += gos[i]
+
+             if tp == 0 and fp == 0 and fn == 0:
+                 continue
+             total += 1
+             if tp != 0:
+                 p_total += 1
+                 precision = tp / (1.0 * (tp + fp))
+                 recall = tp / (1.0 * (tp + fn))
+                 p += precision
+                 r += recall
+
+         if total > 0 and p_total > 0:
+             r /= total
+             p /= p_total
+             if p + r > 0:
+                 f = 2 * p * r / (p + r)
+                 if f_max < f:
+                     f_max = f
+                     p_max = p
+                     r_max = r
+                     t_max = threshold
+                     preds_max = preds
+
+     return f_max, p_max, r_max, t_max, preds_max
+
+
308
+ def metrics_multi_label_for_pred(targets, preds, savepath=None):
309
+ '''
310
+ metrics for multi-label classification
311
+ cal metrics for true matrix to predict
312
+ :param targets: true 0-1 indicator matrix (n_samples, n_labels)
313
+ :param preds: preds 0~1 indicator matrix (n_samples, n_labels)
314
+ :return: some metrics
315
+ '''
316
+ targets_relevant = relevant_indexes(targets)
317
+ preds_relevant = relevant_indexes(preds)
318
+ acc_list = []
319
+ prec_list = []
320
+ recall_list = []
321
+ jaccard_list = []
322
+ f1_list = []
323
+ for idx in range(targets.shape[0]):
324
+ target_relevant = targets_relevant[idx]
325
+ pred_relevant = preds_relevant[idx]
326
+
327
+ target_len = len(target_relevant)
328
+ predict_len = len(pred_relevant)
329
+ union_len = len(set(target_relevant).union(set(pred_relevant)))
330
+ intersection_len = len(set(target_relevant).intersection(set(pred_relevant)))
331
+ acc = 1.0 - (union_len - intersection_len) / targets.shape[1]
332
+ prec = 0.0
333
+ if predict_len > 0:
334
+ prec = intersection_len / predict_len
335
+ recall = 0
336
+ if target_len > 0:
337
+ recall = intersection_len / target_len
338
+ else:
339
+ print(targets[idx])
340
+ jac = intersection_len / union_len
341
+ if prec + recall == 0:
342
+ f1 = 0.0
343
+ else:
344
+ f1 = 2.0 * prec * recall / (prec + recall)
345
+
346
+ acc_list.append(acc)
347
+ prec_list.append(prec)
348
+ recall_list.append(recall)
349
+ jaccard_list.append(jac)
350
+ f1_list.append(f1)
351
+
352
+ return {
353
+ "acc": round(sum(acc_list)/targets.shape[0], 6),
354
+ "jaccard": round(sum(jaccard_list)/targets.shape[0], 6),
355
+ "prec": round(sum(prec_list)/targets.shape[0], 6),
356
+ "recall": round(sum(recall_list)/targets.shape[0], 6),
357
+ "f1": round(sum(f1_list)/targets.shape[0], 6)
358
+ }
359
+
360
+
361
+ def label_id_2_array(label_ids, label_size):
362
+ '''
363
+ building 0-1 indicator array for multi-label classification
364
+ :param label_ids:
365
+ :param label_size:
366
+ :return:
367
+ '''
368
+ arr = np.zeros(label_size)
369
+ arr[label_ids] = 1
370
+ return arr
371
+
372
+
373
+ def relevant_indexes(matrix):
374
+ '''
375
+ Which positions in the multi-label are labeled as 1
376
+ :param matrix:
377
+ :return:
378
+ '''
379
+ if torch.is_tensor(matrix):
380
+ matrix = matrix.detach().cpu().numpy()
381
+ relevants = []
382
+ shape = matrix.shape
383
+ if matrix.ndim == 3:
384
+
385
+ for x in range(shape[0]):
386
+ relevant_x = []
387
+ for y in range(shape[1]):
388
+ relevant_y = []
389
+ for z in range(shape[2]):
390
+ if matrix[x, y, z] == 1:
391
+ relevant_y.append(int(z))
392
+ relevant_x.append(relevant_y)
393
+ relevants.append(relevant_x)
394
+ elif matrix.ndim == 2:
395
+ for row in range(shape[0]):
396
+ relevant = []
397
+ for col in range(shape[1]):
398
+ if matrix[row, col] == 1:
399
+ relevant.append(int(col))
400
+ relevants.append(relevant)
401
+ else:
402
+ for idx in range(matrix.shape[0]):
403
+ if matrix[idx] == 1:
404
+ relevants.append(int(idx))
405
+ return relevants
406
+
407
+
408
+ def irrelevant_indexes(matrix):
409
+ '''
410
+ Which positions in the multi-label label are 0
411
+ :param matrix:
412
+ :return:
413
+ '''
414
+ if torch.is_tensor(matrix):
415
+ matrix = matrix.detach().cpu().numpy()
416
+
417
+ irrelevants = []
418
+ if matrix.ndim == 3:
419
+ for x in range(matrix.shape[0]):
420
+ irrelevant_x = []
421
+ for y in range(matrix.shape[1]):
422
+ irrelevant_y = []
423
+ for z in range(matrix.shape[2]):
424
+ if matrix[x, y, z] == 0:
425
+ irrelevant_y.append(int(z))
426
+ irrelevant_x.append(irrelevant_y)
427
+ irrelevants.append(irrelevant_x)
428
+ elif matrix.ndim == 2:
429
+ for row in range(matrix.shape[0]):
430
+ irrelevant = []
431
+ for col in range(matrix.shape[1]):
432
+ if matrix[row, col] == 1:
433
+ irrelevant.append(int(col))
434
+ irrelevants.append(irrelevant)
435
+ else:
436
+ for idx in range(matrix.shape[0]):
437
+ if matrix[idx] == 1:
438
+ irrelevants.append(int(idx))
439
+
440
+ return irrelevants
441
+
442
+
443
+ def prob_2_pred(prob, threshold):
444
+ '''
445
+ Probabilities converted to 0-1 predicted labels
446
+ :param prob:
447
+ :param threshold:
448
+ :return:
449
+ '''
450
+ if torch.is_tensor(prob):
451
+ prob = prob.detach().cpu().numpy()
452
+
453
+ if isinstance(prob, (np.ndarray, np.generic)):
454
+ return (prob >= threshold).astype(int)
455
+
456
+
457
+ def roc_auc_macro(target, prob):
458
+ '''
459
+ macro roc auc
460
+ :param target:
461
+ :param prob:
462
+ :return:
463
+ '''
464
+ return roc_auc_score(target, prob, average="macro")
465
+
466
+
467
+ def pr_auc_macro(target, prob):
468
+ '''
469
+ macro pr-auc
470
+ :param target:
471
+ :param prob:
472
+ :return:
473
+ '''
474
+ return average_precision_score(target, prob, average="macro")
475
+
476
+
477
+ def write_error_samples_multi_label(filepath, samples, input_indexs, input_id_2_names, output_id_2_name, targets,
478
+ probs, threshold=0.5,
479
+ use_other_diags=False, use_other_operas=False, use_checkin_department=False):
480
+ '''
481
+ writer bad cases for multi-label classification
482
+ :param filepath:
483
+ :param samples:
484
+ :param input_indexs:
485
+ :param input_id_2_names:
486
+ :param output_id_2_name:
487
+ :param targets:
488
+ :param probs:
489
+ :param threshold:
490
+ :param use_other_diags:
491
+ :param use_other_operas:
492
+ :param use_checkin_department:
493
+ :return:
494
+ '''
495
+ preds = prob_2_pred(probs, threshold=threshold)
496
+ targets_relevant = relevant_indexes(targets)
497
+ preds_relevant = relevant_indexes(preds)
498
+ with open(filepath, "w") as fp:
499
+ writer = csv.writer(fp)
500
+ writer.writerow(["score", "y_true", "y_pred", "inputs"])
501
+ for i in range(len(targets_relevant)):
502
+ target = set(targets_relevant[i])
503
+ pred = set(preds_relevant[i])
504
+ jacc = len(target.intersection(pred))/(len(target.union(pred)))
505
+ if output_id_2_name:
506
+ target_labels = [output_id_2_name[v] for v in target]
507
+ pred_labels = [output_id_2_name[v] for v in pred]
508
+ else:
509
+ target_labels = target
510
+ pred_labels = pred
511
+ sample = samples[i]
512
+ if input_id_2_names:
513
+ new_sample = []
514
+ for idx, input_index in enumerate(input_indexs):
515
+ if input_index == 3 and not use_checkin_department:
516
+ input_index = 12
517
+ new_sample.append([input_id_2_names[idx][v] for v in sample[input_index]])
518
+ if input_index == 6 and use_other_diags or input_index == 8 and use_other_operas or input_index == 10 and use_other_diags:
519
+ new_sample.append([input_id_2_names[idx][v] for v in sample[input_index + 1]])
520
+ else:
521
+ new_sample = sample
522
+ row = [jacc, target_labels, pred_labels, new_sample]
523
+ writer.writerow(row)
524
+
525
+
526
+ if __name__ == "__main__":
527
+ '''multi_label'''
528
+ probs = np.array([[0.6, 0.1, 0.1], [0.8, 0.3, 0.8], [0.8, 0.1, 0.1], [0.8, 0.1, 0.1]])
529
+ targets = np.array([[1, 1, 0], [1, 0, 1], [1, 0, 0], [0, 0, 1]])
530
+ print(metrics_multi_label(targets, probs))
531
+ t = np.array([[0, 0, 0], [1, 1, 1]])
532
+ print(t[0, :])
533
+ print(np.unique(t[0, :]))
534
+
535
+
536
+
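For clarity, a small sketch of how prob_2_pred and the index helpers interact (the probabilities are made up):

import numpy as np

probs = np.array([[0.7, 0.4], [0.2, 0.9]])
preds = prob_2_pred(probs, threshold=0.5)  # [[1, 0], [0, 1]]
print(relevant_indexes(preds))             # [[0], [1]]
print(irrelevant_indexes(preds))           # [[1], [0]]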
pooling.py ADDED
@@ -0,0 +1,301 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ import torch
+ import torch.nn as nn
+
+ from .modeling_bert import BertEncoder, BertPooler
+
+
+ class GlobalMaskMaxPooling1D(nn.Module):
+     def __init__(self, ):
+         super(GlobalMaskMaxPooling1D, self).__init__()
+
+     def forward(self, x, mask=None):
+         if mask is not None:
+             # (B, Seq_len) -> (B, Seq_len, 1)
+             mask = 1.0 - mask
+             mask = mask * (-2**10 + 1)
+             mask = torch.unsqueeze(mask, dim=-1)
+             x += mask
+         return torch.max(x, dim=1)[0]
+
+
+ class GlobalMaskMinPooling1D(nn.Module):
+     def __init__(self, ):
+         super(GlobalMaskMinPooling1D, self).__init__()
+
+     def forward(self, x, mask=None):
+         if mask is not None:
+             # (B, Seq_len) -> (B, Seq_len, 1)
+             mask = 1.0 - mask
+             mask = mask * (2**10 + 1)
+             mask = torch.unsqueeze(mask, dim=-1)
+             x += mask
+         return torch.min(x, dim=1)[0]
+
+
+ class GlobalMaskAvgPooling1D(nn.Module):
+     def __init__(self):
+         super(GlobalMaskAvgPooling1D, self).__init__()
+
+     def forward(self, x, mask=None):
+         if mask is not None:
+             # (B, Seq_len) -> (B, Seq_len, 1)
+             mask = torch.unsqueeze(mask, dim=-1)
+             x *= mask
+             return torch.sum(x, dim=1)/torch.sum(mask, dim=1)
+         else:
+             return torch.mean(x, dim=1)
+
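A minimal sketch of masked mean pooling with the module above (batch 2, 4 tokens, hidden size 8 are assumptions; the second sequence has one padded position that is excluded from the average):

import torch

pool = GlobalMaskAvgPooling1D()
x = torch.randn(2, 4, 8)                 # (B, Seq_len, Embed)
mask = torch.tensor([[1., 1., 1., 1.],
                     [1., 1., 1., 0.]])  # 1 = real token, 0 = padding
out = pool(x, mask)
print(out.shape)                         # torch.Size([2, 8])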
+
+ class GlobalMaskSumPooling1D(nn.Module):
+     def __init__(self, axis):
+         '''
+         sum pooling
+         :param axis: axis=0 adds up all the rows of the matrix; axis=1 adds up all the cols of the matrix
+         '''
+         super(GlobalMaskSumPooling1D, self).__init__()
+         self.axis = axis
+
+     def forward(self, x, mask=None):
+         if mask is not None:
+             # (B, Seq_len) -> (B, Seq_len, 1)
+             mask = torch.unsqueeze(mask, dim=-1)
+             x *= mask
+         return torch.sum(x, dim=self.axis)
+
+
+ class GlobalMaskWeightedAttentionPooling1D(nn.Module):
+     def __init__(self, embed_size, use_bias=False):
+         super(GlobalMaskWeightedAttentionPooling1D, self).__init__()
+         self.embed_size = embed_size
+         self.use_bias = use_bias
+
+         self.W = nn.Parameter(torch.Tensor(self.embed_size))
+         nn.init.trunc_normal_(self.W, std=0.01)
+         if self.use_bias:
+             self.b = nn.Parameter(torch.Tensor(1))
+             nn.init.trunc_normal_(self.b, std=0.01)
+
+     def forward(self, x, mask=None):
+         # (B, Len, Embed) x (Embed,) = (B, Len)
+         logits = torch.matmul(x, self.W)
+         if self.use_bias:
+             logits += self.b
+
+         if mask is not None:
+             attention_probs = nn.Softmax(dim=-1)(logits + (1.0 - mask) * -10000)
+         else:
+             attention_probs = nn.Softmax(dim=-1)(logits)
+         x = torch.sum(torch.unsqueeze(attention_probs, dim=-1) * x, dim=1)
+         return x
+
+
+ class GlobalMaskContextAttentionPooling1D(nn.Module):
+     def __init__(self, embed_size, units=None, use_additive_bias=False, use_attention_bias=False):
+         super(GlobalMaskContextAttentionPooling1D, self).__init__()
+         self.embed_size = embed_size
+         self.use_additive_bias = use_additive_bias
+         self.use_attention_bias = use_attention_bias
+         self.units = units if units else embed_size
+
+         self.U = nn.Parameter(torch.Tensor(self.embed_size, self.units))
+         self.V = nn.Parameter(torch.Tensor(self.embed_size, self.units))
+         if self.use_additive_bias:
+             self.b1 = nn.Parameter(torch.Tensor(self.units))
+             nn.init.trunc_normal_(self.b1, std=0.01)
+         if self.use_attention_bias:
+             self.b2 = nn.Parameter(torch.Tensor(1))
+             nn.init.trunc_normal_(self.b2, std=0.01)
+
+         self.c = nn.Parameter(torch.Tensor(self.units))
+
+         nn.init.trunc_normal_(self.U, std=0.01)
+         nn.init.trunc_normal_(self.V, std=0.01)
+         nn.init.trunc_normal_(self.c, std=0.01)
+
+     def forward(self, x, mask=None):
+         # (B, Len, Embed) x (Embed, Units) = (B, Len, Units)
+         q = torch.matmul(x, self.U)
+         k = torch.matmul(x, self.V)
+         if self.use_additive_bias:
+             h = torch.tanh(q + k + self.b1)
+         else:
+             h = torch.tanh(q + k)
+
+         if self.use_attention_bias:
+             e = torch.matmul(h, self.c) + self.b2
+         else:
+             e = torch.matmul(h, self.c)
+         if mask is not None:
+             attention_probs = nn.Softmax(dim=-1)(e + (1.0 - mask) * -10000)
+         else:
+             attention_probs = nn.Softmax(dim=-1)(e)
+         x = torch.sum(torch.unsqueeze(attention_probs, dim=-1) * x, dim=1)
+         return x
+
+
+ class GlobalMaskValueAttentionPooling1D(nn.Module):
+     def __init__(self, embed_size, units=None, use_additive_bias=False, use_attention_bias=False):
+         super(GlobalMaskValueAttentionPooling1D, self).__init__()
+         self.embed_size = embed_size
+         self.use_additive_bias = use_additive_bias
+         self.use_attention_bias = use_attention_bias
+         self.units = units if units else embed_size
+
+         self.U = nn.Parameter(torch.Tensor(self.embed_size, self.units))
+         self.V = nn.Parameter(torch.Tensor(self.embed_size, self.units))
+         if self.use_additive_bias:
+             self.b1 = nn.Parameter(torch.Tensor(self.units))
+             nn.init.trunc_normal_(self.b1, std=0.01)
+         if self.use_attention_bias:
+             self.b2 = nn.Parameter(torch.Tensor(self.embed_size))
+             nn.init.trunc_normal_(self.b2, std=0.01)
+
+         self.W = nn.Parameter(torch.Tensor(self.units, self.embed_size))
+
+         nn.init.trunc_normal_(self.U, std=0.01)
+         nn.init.trunc_normal_(self.V, std=0.01)
+         nn.init.trunc_normal_(self.W, std=0.01)
+
+     def forward(self, x, mask=None):
+         # (B, Len, Embed) x (Embed, Units) = (B, Len, Units)
+         q = torch.matmul(x, self.U)
+         k = torch.matmul(x, self.V)
+         if self.use_additive_bias:
+             h = torch.tanh(q + k + self.b1)
+         else:
+             h = torch.tanh(q + k)
+
+         # (B, Len, Units) x (Units, Embed) = (B, Len, Embed)
+         if self.use_attention_bias:
+             e = torch.matmul(h, self.W) + self.b2
+         else:
+             e = torch.matmul(h, self.W)
+         if mask is not None:
+             attention_probs = nn.Softmax(dim=1)(e + torch.unsqueeze((1.0 - mask) * -10000, dim=-1))
+         else:
+             attention_probs = nn.Softmax(dim=1)(e)
+         x = torch.sum(attention_probs * x, dim=1)
+         return x
+
+     def __repr__(self):
+         return self.__class__.__name__ + ' (' + str(self.embed_size) + ' -> ' + str(self.embed_size) + ')'
+
+
+ class GlobalMaskTransformerPooling1D(nn.Module):
+     def __init__(self, config):
+         super(GlobalMaskTransformerPooling1D, self).__init__()
+         self.embeddings = nn.Parameter(torch.Tensor(1, 1, config.hidden_size))
+         nn.init.trunc_normal_(self.embeddings, std=0.02)
+         config.num_hidden_layers = 2
+         self.encoder = BertEncoder(config)
+         self.pooler = BertPooler(config)
+
+     def forward(self, x, mask=None):
+         B, Seq_len, Embed = x.size()
+         cls_emb_batch = self.embeddings.expand(B, 1, Embed)
+         merged_output = torch.cat((cls_emb_batch, x), dim=1)  # [B, Seq_len + 1, Embed]
+         if mask is not None:
+             device = x.device
+             cls_mask = torch.ones(B, 1).to(device)
+             mask = torch.cat([cls_mask, mask], dim=1)
+             mask = mask[:, None, None, :]
+
+         sequence_output = self.encoder(merged_output,
+                                        attention_mask=mask,
+                                        head_mask=None,
+                                        encoder_hidden_states=None,
+                                        encoder_attention_mask=None,
+                                        output_attentions=False,
+                                        output_hidden_states=False,
+                                        return_dict=False)[0]
+         pooled_output = self.pooler(sequence_output)
+         return pooled_output
+
+
+ class GlobalMaxPool1d(nn.Module):
+     def __init__(self):
+         super(GlobalMaxPool1d, self).__init__()
+         self.fc = nn.AdaptiveMaxPool1d(1)
+
+     def forward(self, x):
+         x = x.permute(0, 2, 1)
+         x = self.fc(x)
+         x = torch.squeeze(x, dim=-1)
+         return x
+
+
+ class GlobalAvgPool1d(nn.Module):
+     def __init__(self, ):
+         super(GlobalAvgPool1d, self).__init__()
+         self.fc = nn.AdaptiveAvgPool1d(1)
+
+     def forward(self, x):
+         x = x.permute(0, 2, 1)
+         x = self.fc(x)
+         x = torch.squeeze(x, dim=-1)
+         return x
+
+
+ class AttentionPool1d(nn.Module):
+     def __init__(self, embed_size, device="cuda"):
+         super(AttentionPool1d, self).__init__()
+         self.embed_size = embed_size
+         self.W = nn.Parameter(torch.Tensor(self.embed_size, self.embed_size))
+         self.b = nn.Parameter(torch.Tensor(self.embed_size))
+         self.c = nn.Parameter(torch.Tensor(self.embed_size))
+         nn.init.trunc_normal_(self.W, std=0.02)
+         nn.init.trunc_normal_(self.b, std=0.02)
+         nn.init.trunc_normal_(self.c, std=0.02)
+
+     def forward(self, x):
+         '''
+         # simpler single-weight variant for reference:
+         # x: (B, Seq_len, Embed)
+         # mul: (B, Seq_len)
+         mul = torch.matmul(x, self.W)
+         attention_probs = nn.Softmax(dim=-1)(mul)
+         x = torch.sum(torch.unsqueeze(attention_probs, dim=-1) * x, dim=1)
+         '''
+         mul = torch.tanh(torch.matmul(x, self.W) + self.b)
+         mul = torch.matmul(mul, self.c)
+         attention_probs = nn.Softmax(dim=-1)(mul)
+         x = torch.sum(torch.unsqueeze(attention_probs, dim=-1) * x, dim=1)
+         return x
+
+
+ class TransformerPool1d(nn.Module):
+     def __init__(self, config, embeddings, embed_size, num_transformer_layers=2, CLS_ID=102, device="cuda"):
+         super(TransformerPool1d, self).__init__()
+         if embeddings:
+             self.embeddings = embeddings
+         else:
+             self.embeddings = nn.Parameter(torch.Tensor(1, 1, embed_size))
+             nn.init.trunc_normal_(self.embeddings, std=0.02)
+         # self.embeddings = BertEmbeddings(config)
+         self.CLS_ID = CLS_ID
+         self.device = device
+         config.num_hidden_layers = num_transformer_layers
+         self.encoder = BertEncoder(config)
+         self.pooler = BertPooler(config)
+
+     def forward(self, x):
+         # x: (B, Seq_len, Embed)
+         B, Seq_len, Embed = x.size()
+         # cls_emb_batch = self.embeddings(torch.tensor([[self.CLS_ID]] * x.size()[0], dtype=torch.long).to(self.device))  # B, 1
+         cls_emb_batch = self.embeddings.expand(B, 1, Embed)
+         merged_output = torch.cat((cls_emb_batch, x), dim=1)  # [B, Seq_len + 1, Embed]
+         sequence_output = self.encoder(merged_output,
+                                        attention_mask=None,
+                                        head_mask=None,
+                                        encoder_hidden_states=None,
+                                        encoder_attention_mask=None,
+                                        output_attentions=False,
+                                        output_hidden_states=False,
+                                        return_dict=False)[0]
+         pooled_output = self.pooler(sequence_output)
+         return pooled_output
+
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:234ed601e664ca2e736f2427dfb8544b47370f641bbd82612297efca3943892a
+ size 6320919985
regression_loss.py ADDED
@@ -0,0 +1,238 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+ '''
+ @license: (C) Copyright 2021, Hey.
+ @author: Hey
+ @email: [email protected]
+ @tel: 137****6540
+ @datetime: 2023/6/15 22:53
+ @project: LucaOne
+ @file: regression_loss.py
+ @desc: regression loss
+ '''
+ import warnings
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ from statsmodels.stats.stattools import durbin_watson
+
+ from .masked_loss import _MaskedLoss
+
+
+ def nanstd(input, dim=None, keepdim=False):
+     mu = torch.nanmean(input, dim=dim, keepdim=True)
+     return torch.sqrt(torch.nanmean((input - mu)**2, dim=dim, keepdim=keepdim))
+
+
+ def iqr(batch, dim=None, reduction='mean'):
+     if dim is None:
+         if len(batch.shape) == 1:
+             dim = 0
+         else:
+             dim = 1
+     if isinstance(batch, np.ndarray):
+         out = np.quantile(batch, 0.75, axis=dim) - \
+             np.quantile(batch, 0.25, axis=dim)
+     elif isinstance(batch, torch.Tensor):
+         out = torch.quantile(batch, 0.75, dim=dim) - \
+             torch.quantile(batch, 0.25, dim=dim)
+     if reduction == 'none':
+         return out
+     elif reduction == 'mean':
+         return out.mean()
+     else:
+         raise NotImplementedError
+
+
+ def naniqr(batch, dim=None, reduction='none'):
+     if dim is None:
+         if len(batch.shape) == 1:
+             dim = 0
+         else:
+             dim = 1
+     if isinstance(batch, np.ndarray):
+         out = np.nanquantile(batch, 0.75, axis=dim) - \
+             np.nanquantile(batch, 0.25, axis=dim)
+     elif isinstance(batch, torch.Tensor):
+         out = torch.nanquantile(batch, 0.75, dim=dim) - \
+             torch.nanquantile(batch, 0.25, dim=dim)
+     if reduction == 'none':
+         return out
+     elif reduction == 'mean':
+         return out.mean()
+     elif reduction == 'nanmean':
+         return torch.nanmean(out)
+     else:
+         raise NotImplementedError
+
+
+ def compute_dw(res, dim=1, replace_missing=0., reduction='none'):
+     """Durbin-Watson statistics
+     https://www.statsmodels.org/devel/generated/statsmodels.stats.stattools.durbin_watson.html
+     """
+     if isinstance(res, torch.Tensor):
+         res = res.detach().cpu().numpy()
+     if replace_missing is not None:
+         res = res.copy()
+         res[np.isnan(res)] = replace_missing
+     out = durbin_watson(res, axis=dim)
+     if reduction == 'mean':
+         return out.mean()
+     elif reduction == 'none':
+         return out
+     elif reduction == 'median':
+         return np.median(out)
+
+
+ def estimate_noise(x, dim=1, window_size=10, step=5, reduce='nanmean', keepdim=True):
+     noises = nanstd(x.unfold(dim, window_size, step), -1, keepdim=False)
+     if reduce == 'nanmedian':
+         return noises.nanmedian(dim, keepdim=keepdim).values
+     if reduce == 'nanmean':
+         return noises.nanmean(dim, keepdim=keepdim)
+     if reduce == 'median':
+         return noises.median(dim, keepdim=keepdim).values
+     if reduce == 'mean':
+         return noises.mean(dim, keepdim=keepdim)
+     if reduce == 'none':
+         return noises
+     raise ValueError
+
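A brief sketch of estimate_noise (values illustrative): it slides length-10 windows with stride 5 along dim, takes the NaN-aware std of each window, and averages them, so for i.i.d. noise it roughly recovers the noise level:

import torch

x = 0.5 * torch.randn(2, 100)     # (batch, time) with noise std 0.5
print(estimate_noise(x, dim=1))   # around 0.5 per row, shape (2, 1)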
+
+ class MaskedMSELoss(_MaskedLoss):
+     """Masked MSE loss."""
+     def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
+         super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+         self.criterion = nn.MSELoss(reduction='none')
+
+
+ class MaskedL1Loss(_MaskedLoss):
+     """Masked L1 loss."""
+
+     def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
+         super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+         self.criterion = nn.L1Loss(reduction='none')
+
+
+ class MaskedHuberLoss(_MaskedLoss):
+     """Masked Huber loss."""
+
+     def __init__(self, reduction='mean', ignore_nans=True, delta=1, ignore_value=-100.0):
+         super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+         self.criterion = nn.HuberLoss(reduction='none', delta=delta)
+
+
+ class IQRLoss(nn.Module):
+     """IQR of the residuals"""
+     def __init__(self, reduction='nanmean', ignore_nans=True, ignore_value=-100.0):
+         super().__init__()
+         self.reduction = reduction
+         self.ignore_nans = ignore_nans
+         self.ignore_value = ignore_value
+
+     def forward(self, input, target=0.):
+         if isinstance(target, torch.Tensor) and not (target.size() == input.size()):
+             warnings.warn(
+                 "Using a target size ({}) that is different to the input size ({}). "
+                 "This will likely lead to incorrect results due to broadcasting. "
+                 "Please ensure they have the same size.".format(
+                     target.size(), input.size()),
+                 stacklevel=2,
+             )
+         if self.ignore_nans:
+             return naniqr(target - input, reduction=self.reduction)
+         else:
+             return iqr(target - input, reduction=self.reduction)
+
+
+ class MaskedLogCoshLoss(_MaskedLoss):
+     def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
+         super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+         self.criterion = LogCoshLoss(reduction='none')
+
+
+ class MaskedXTanhLoss(_MaskedLoss):
+     def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
+         super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+         self.criterion = XTanhLoss(reduction='none')
+
+
+ class MaskedXSigmoidLoss(_MaskedLoss):
+     def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
+         super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+         self.criterion = XSigmoidLoss(reduction='none')
+
+
+ class MaskedAlgebraicLoss(_MaskedLoss):
+     def __init__(self, reduction='mean', ignore_nans=True, ignore_value=-100.0):
+         super().__init__(reduction=reduction, ignore_nans=ignore_nans, ignore_value=ignore_value)
+         self.criterion = AlgebraicLoss(reduction='none')
+
+
+ class LogCoshLoss(torch.nn.Module):
+     def __init__(self, reduction='none'):
+         super().__init__()
+         self.reduction = reduction
+
+     def forward(self, input, target):
+         diff = input - target
+         if self.reduction == 'mean':
+             return torch.mean(torch.log(torch.cosh(diff + 1e-12)))
+         elif self.reduction == 'sum':
+             return torch.sum(torch.log(torch.cosh(diff + 1e-12)))
+         else:
+             return torch.log(torch.cosh(diff + 1e-12))
+
+
+ class XTanhLoss(torch.nn.Module):
+     def __init__(self, reduction='none'):
+         super().__init__()
+         self.reduction = reduction
+
+     def forward(self, input, target):
+         diff = input - target
+         if self.reduction == 'mean':
+             return torch.mean(diff * torch.tanh(diff))
+         elif self.reduction == 'sum':
+             return torch.sum(diff * torch.tanh(diff))
+         else:
+             return diff * torch.tanh(diff)
+
+
+ class XSigmoidLoss(torch.nn.Module):
+     def __init__(self, reduction='none'):
+         super().__init__()
+         self.reduction = reduction
+
+     def forward(self, input, target):
+         diff = input - target
+         if self.reduction == 'mean':
+             return torch.mean(2 * diff * torch.sigmoid(diff) - diff)
+         elif self.reduction == 'sum':
+             return torch.sum(2 * diff * torch.sigmoid(diff) - diff)
+         else:
+             return 2 * diff * torch.sigmoid(diff) - diff
+
+
+ class AlgebraicLoss(torch.nn.Module):
+     def __init__(self, reduction='none'):
+         super().__init__()
+         self.reduction = reduction
+
+     def forward(self, input, target):
+         diff = input - target
+         if self.reduction == 'mean':
+             return torch.mean(diff * diff / torch.sqrt(1 + diff * diff))
+         elif self.reduction == 'sum':
+             return torch.sum(diff * diff / torch.sqrt(1 + diff * diff))
+         else:
+             return diff * diff / torch.sqrt(1 + diff * diff)
+
+
+ if __name__ == "__main__":
+     import torch
+     label = torch.Tensor([[[1], [1], [-100]], [[1], [-100], [0]]])
+     pred = torch.Tensor([[[2], [1], [3]], [[2], [1], [3]]])
+     loss = MaskedMSELoss(reduction="mean", ignore_nans=True, ignore_value=-100.0)
+     print("loss:")
+     print(loss(pred, label))
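As a usage note, IQRLoss scores the spread of the residuals rather than their mean magnitude; a minimal sketch (shapes and noise level are made up):

import torch

criterion = IQRLoss(reduction='nanmean')
target = torch.randn(4, 50)
pred = target + 0.1 * torch.randn(4, 50)
print(criterion(pred, target))   # IQR of the residuals, roughly 1.35 * 0.1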
utils.py ADDED
@@ -0,0 +1,979 @@
+ #!/usr/bin/env python
+ # encoding: utf-8
+
+ import math
+ import os, csv, json
+ import io, textwrap, itertools
+ import subprocess
+ from Bio import SeqIO
+ import torch
+ import numpy as np
+ import sys, random
+ from sklearn.metrics import confusion_matrix
+ import matplotlib.pyplot as plt
+ import pynvml, requests
+ from collections import OrderedDict
+
+ plt.rcParams.update({'font.size': 18})
+ plt.rcParams['axes.unicode_minus'] = False
+
+ from .file_operator import file_reader
+ from .multi_label_metrics import prob_2_pred, relevant_indexes, metrics_multi_label
+ from .metrics import metrics_multi_class, metrics_binary, metrics_regression
+
+ common_nucleotide_set = {'A', 'T', 'C', 'G', 'U', 'N'}
+
+ # not {'O', 'U', 'Z', 'J', 'B'}
+ # Common amino acids
+ common_amino_acid_set = {'R', 'X', 'S', 'G', 'W', 'I', 'Q', 'A', 'T', 'V', 'K', 'Y', 'C', 'N', 'L', 'F', 'D', 'M', 'P', 'H', 'E'}
+
+
+ def to_device(device, batch):
+     '''
+     input to device
+     :param device:
+     :param batch:
+     :return:
+     '''
+     new_batch = {}
+     sample_num = 0
+     tens = None
+     for item1 in batch.items():
+         new_batch[item1[0]] = {}
+         if isinstance(item1[1], dict):
+             for item2 in item1[1].items():
+                 new_batch[item1[0]][item2[0]] = {}
+                 if isinstance(item2[1], dict):
+                     for item3 in item2[1].items():
+                         if item3[1] is not None and not isinstance(item3[1], int) and not isinstance(item3[1], str) and not isinstance(item3[1], float):
+                             new_batch[item1[0]][item2[0]][item3[0]] = item3[1].to(device)
+                             tens = item3[1]
+                         else:
+                             new_batch[item1[0]][item2[0]][item3[0]] = item3[1]
+                 else:
+                     if item2[1] is not None and not isinstance(item2[1], int) and not isinstance(item2[1], str) and not isinstance(item2[1], float):
+                         new_batch[item1[0]][item2[0]] = item2[1].to(device)
+                         tens = item2[1]
+                     else:
+                         new_batch[item1[0]][item2[0]] = item2[1]
+         else:
+             if item1[1] is not None and not isinstance(item1[1], int) and not isinstance(item1[1], str) and not isinstance(item1[1], float):
+                 new_batch[item1[0]] = item1[1].to(device)
+                 tens = item1[1]
+             else:
+                 new_batch[item1[0]] = item1[1]
+     if tens is not None:
+         sample_num = tens.shape[0]
+     return new_batch, sample_num
+
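A sketch of to_device on a nested batch dict (the keys and shapes are assumptions; non-tensor values pass through unchanged and sample_num is read from the first tensor moved):

import torch

batch = {"inputs": {"input_ids": torch.zeros(4, 8, dtype=torch.long), "seq_type": "prot"}}
new_batch, sample_num = to_device(torch.device("cpu"), batch)
print(sample_num)  # 4, the batch dimension of the tensor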
+
+ def get_parameter_number(model):
+     '''
+     calculate the parameter number of the model
+     :param model:
+     :return:
+     '''
+     param_size = 0
+     param_sum = 0
+     trainable_size = 0
+     trainable_num = 0
+     for param in model.parameters():
+         cur_size = param.nelement() * param.element_size()
+         cur_num = param.nelement()
+         param_size += cur_size
+         param_sum += cur_num
+         if param.requires_grad:
+             trainable_size += cur_size
+             trainable_num += cur_num
+     buffer_size = 0
+     buffer_sum = 0
+     for buffer in model.buffers():
+         buffer_size += buffer.nelement() * buffer.element_size()
+         buffer_sum += buffer.nelement()
+     '''
+     total_num = sum(p.numel() for p in model.parameters())
+     total_size = sum(p.numel() * p.element_size() for p in model.parameters())
+     total_num += sum(p.numel() for p in model.buffers())
+     total_size += sum(p.numel() * p.element_size() for p in model.buffers())
+     trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     trainable_size = sum(p.numel() * p.element_size() for p in model.parameters() if p.requires_grad)
+     '''
+     return {
+         'total_num': "%fM" % round((buffer_sum + param_sum)/(1024 * 1024), 2),
+         'total_size': "%fMB" % round((buffer_size + param_size)/(1024 * 1024), 2),
+         'param_sum': "%fM" % round(param_sum/(1024 * 1024), 2),
+         'param_size': "%fMB" % round(param_size/(1024 * 1024), 2),
+         'buffer_sum': "%fM" % round(buffer_sum/(1024 * 1024), 2),
+         'buffer_size': "%fMB" % round(buffer_size/(1024 * 1024), 2),
+         'trainable_num': "%fM" % round(trainable_num/(1024 * 1024), 10),
+         'trainable_size': "%fMB" % round(trainable_size/(1024 * 1024), 10)
+     }
+
+
+ def set_seed(args):
+     random.seed(args.seed)
+     np.random.seed(args.seed)
+     torch.manual_seed(args.seed)
+     if args.n_gpu > 0:
+         torch.cuda.manual_seed(args.seed)
+         torch.cuda.manual_seed_all(args.seed)
+
+
+ def label_id_2_label_name(output_mode, label_list, prob, threshold=0.5):
+     '''
+     convert label ids to label names
+     :param output_mode:
+     :param label_list:
+     :param prob:
+     :param threshold:
+     :return:
+     '''
+     if output_mode in ["multi-label", "multi_label"]:
+         res = []
+         pred = prob_2_pred(prob, threshold)
+         pred_index = relevant_indexes(pred)
+         for row in range(prob.shape[0]):
+             label_names = [label_list[idx] for idx in pred_index[row]]
+             res.append(label_names)
+         return res
+     elif output_mode in ["multi-class", "multi_class"]:
+         pred = np.argmax(prob, axis=1)
+         label_names = [label_list[idx] for idx in pred]
+         return label_names
+     elif output_mode in ["binary-class", "binary_class"]:
+         if prob.ndim == 2:
+             prob = prob.flatten(order="C")
+         pred = prob_2_pred(prob, threshold)
+         label_names = [label_list[idx] for idx in pred]
+         return label_names
+     else:
+         raise KeyError(output_mode)
+
+
+ def plot_bins(data, xlabel, ylabel, bins, filepath):
+     '''
+     plot bins
+     :param data:
+     :param xlabel:
+     :param ylabel:
+     :param bins: number of bins
+     :param filepath: png save filepath
+     :return:
+     '''
+     plt.figure(figsize=(40, 20), dpi=100)
+     plt.hist(data, bins=bins)
+     # plt.xticks(range(min(data), max(data)))
+     # plt.grid(linestyle='--', alpha=0.5)
+
+     plt.xlabel(xlabel)
+     plt.ylabel(ylabel)
+     if filepath is None:
+         plt.show()
+     else:
+         plt.savefig(filepath)
+     plt.clf()
+     plt.close()
+
+
+ def plot_confusion_matrix_for_binary_class(targets, preds, cm=None, savepath=None):
+     '''
+     :param targets: ground truth
+     :param preds: predictions
+     :param cm: confusion matrix
+     :param savepath: confusion matrix picture save path
+     '''
+
+     plt.figure(figsize=(40, 20), dpi=100)
+     if cm is None:
+         cm = confusion_matrix(targets, preds, labels=[0, 1])
+
+     plt.matshow(cm, cmap=plt.cm.Oranges)
+     plt.colorbar()
+
+     for x in range(len(cm)):
+         for y in range(len(cm)):
+             plt.annotate(cm[x, y], xy=(y, x), verticalalignment='center', horizontalalignment='center')
+     plt.ylabel('True')
+     plt.xlabel('Prediction')
+     if savepath:
+         plt.savefig(savepath, dpi=100)
+     else:
+         plt.show()
+     plt.close("all")
+
+
+ def save_labels(filepath, label_list):
+     '''
+     save labels
+     :param filepath:
+     :param label_list:
+     :return:
+     '''
+     with open(filepath, "w") as wfp:
+         wfp.write("label" + "\n")
+         for label in label_list:
+             wfp.write(label + "\n")
+
+
+ def load_labels(filepath, header=True):
+     '''
+     load labels
+     :param filepath:
+     :param header: whether the file has a header or not
+     :return:
+     '''
+     label_list = []
+     with open(filepath, "r") as rfp:
+         for label in rfp:
+             label_list.append(label.strip())
+     if len(label_list) > 0 and (header or label_list[0] == "label"):
+         return label_list[1:]
+     return label_list
+
+
+ def load_vocab(vocab_path):
+     '''
+     load vocab
+     :param vocab_path:
+     :return:
+     '''
+     vocab = {}
+     with open(vocab_path, "r") as rfp:
+         for line in rfp:
+             v = line.strip()
+             vocab[v] = len(vocab)
+     return vocab
+
+
+ def subprocess_popen(statement):
+     '''
+     execute a shell command
+     :param statement:
+     :return:
+     '''
+     p = subprocess.Popen(statement, shell=True, stdout=subprocess.PIPE)
+     while p.poll() is None:
+         if p.wait() != 0:
+             print("fail.")
+             return False
+         else:
+             re = p.stdout.readlines()
+             result = []
+             for i in range(len(re)):
+                 res = re[i].decode('utf-8').strip('\r\n')
+                 result.append(res)
+             return result
+
+
+ def prepare_inputs(input_type, embedding_type, batch):
+     if input_type == "sequence":
+         inputs = {
+             "input_ids_a": batch[0],
+             "attention_mask_a": batch[1],
+             "token_type_ids_a": batch[2],
+             "input_ids_b": batch[4],
+             "attention_mask_b": batch[5],
+             "token_type_ids_b": batch[6],
+             "labels": batch[-1]
+         }
+     elif input_type == "embedding":
+         if embedding_type not in ["vector", "bos"]:
+             inputs = {
+                 "embedding_info_a": batch[0],
+                 "embedding_attention_mask_a": batch[1],
+                 "embedding_info_b": batch[2],
+                 "embedding_attention_mask_b": batch[3],
+                 "labels": batch[-1]
+             }
+         else:
+             inputs = {
+                 "embedding_info_a": batch[0],
+                 "embedding_attention_mask_a": None,
+                 "embedding_info_b": batch[1],
+                 "embedding_attention_mask_b": None,
+                 "labels": batch[-1]
+             }
+     elif input_type == "structure":
+         inputs = {
+             "struct_input_ids_a": batch[0],
+             "struct_contact_map_a": batch[1],
+             "struct_input_ids_b": batch[2],
+             "struct_contact_map_b": batch[3],
+             "labels": batch[-1]
+         }
+     elif input_type == "sefn":
+         if embedding_type not in ["vector", "bos"]:
+             inputs = {
+                 "input_ids_a": batch[0],
+                 "attention_mask_a": batch[1],
+                 "token_type_ids_a": batch[2],
+                 "embedding_info_a": batch[4],
+                 "embedding_attention_mask_a": batch[5],
+                 "input_ids_b": batch[6],
+                 "attention_mask_b": batch[7],
+                 "token_type_ids_b": batch[8],
+                 "embedding_info_b": batch[10],
+                 "embedding_attention_mask_b": batch[11],
+                 "labels": batch[-1],
+             }
+         else:
+             inputs = {
+                 "input_ids_a": batch[0],
+                 "attention_mask_a": batch[1],
+                 "token_type_ids_a": batch[2],
+                 "embedding_info_a": batch[4],
+                 "embedding_attention_mask_a": None,
+                 "input_ids_b": batch[5],
+                 "attention_mask_b": batch[6],
+                 "token_type_ids_b": batch[7],
+                 "embedding_info_b": batch[9],
+                 "embedding_attention_mask_b": None,
+                 "labels": batch[-1],
+             }
+     elif input_type == "ssfn":
+         inputs = {
+             "input_ids_a": batch[0],
+             "attention_mask_a": batch[1],
+             "token_type_ids_a": batch[2],
+             "struct_input_ids_a": batch[4],
+             "struct_contact_map_a": batch[5],
+             "input_ids_b": batch[6],
+             "attention_mask_b": batch[7],
+             "token_type_ids_b": batch[8],
+             "struct_input_ids_b": batch[10],
+             "struct_contact_map_b": batch[11],
+             "labels": batch[-1]
+         }
+     else:
+         inputs = None
+     return inputs
+
+
+ def gene_seq_replace_re(seq):
+     '''
+     restore a digit-encoded nucleic acid sequence back to nucleotides (1->A, 2->T, 3->C, 4->G, others->N)
+     :param seq:
+     :return:
+     '''
+     new_seq = ""
+     for ch in seq:
+         if ch == '1':
+             new_seq += "A"
+         elif ch == '2':
+             new_seq += "T"
+         elif ch == '3':
+             new_seq += "C"
+         elif ch == '4':
+             new_seq += "G"
+         else:  # unknown
+             new_seq += "N"
+     return new_seq
+
+
+ def gene_seq_replace(seq):
+     '''
+     nucleic acid encoding (gene replace: A->1, U/T->2, C->3, G->4, N->5)
+     :param seq:
+     :return:
+     '''
+     new_seq = ""
+     for ch in seq:
+         if ch in ["A", "a"]:
+             new_seq += "1"
+         elif ch in ["T", "U", "t", "u"]:
+             new_seq += "2"
+         elif ch in ["C", "c"]:
+             new_seq += "3"
+         elif ch in ["G", "g"]:
+             new_seq += "4"
+         else:  # unknown
+             new_seq += "5"
+     return new_seq
+
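A round-trip sketch of the two sequence codecs above (the input string is made up):

print(gene_seq_replace("ATUCGN"))                      # '122345'
print(gene_seq_replace_re(gene_seq_replace("ATCGN")))  # 'ATCGN'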
393
+
394
+ def get_labels(label_filepath, header=True):
395
+ '''
396
+ get labels from file, exists header
397
+ :param label_filepath:
398
+ :param header:
399
+ :return:
400
+ '''
401
+ with open(label_filepath, "r") as fp:
402
+ labels = []
403
+ multi_cols = False
404
+ cnt = 0
405
+ for line in fp:
406
+ line = line.strip()
407
+ cnt += 1
408
+ if cnt == 1 and (header or line == "label"):
409
+ if line.find(",") > 0:
410
+ multi_cols = True
411
+ continue
412
+ if multi_cols:
413
+ idx = line.find(",")
414
+ if idx > 0:
415
+ label_name = line[idx + 1:].strip()
416
+ else:
417
+ label_name = line
418
+ else:
419
+ label_name = line
420
+ labels.append(label_name)
421
+ return labels
422
+
423
+
424
+ def available_gpu_id():
425
+ '''
426
+ 计算可用的GPU id
427
+ :return:
428
+ '''
429
+ pynvml.nvmlInit()
430
+ if not torch.cuda.is_available():
431
+ print("GPU not available")
432
+ return -1
433
+ # 获取GPU数量
434
+ device_count = pynvml.nvmlDeviceGetCount()
435
+ max_available_gpu = -1
436
+ max_available_rate = 0
437
+
438
+ # 遍历所有GPU并检查可用性
439
+ for i in range(device_count):
440
+ handle = pynvml.nvmlDeviceGetHandleByIndex(i)
441
+ memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
442
+ utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
443
+ # 假设如果GPU利用率小于某个阈值(例如10%),我们认为这个GPU目前是空闲的
444
+ if utilization.gpu < 10 and max_available_rate < 100 - utilization.gpu:
445
+ max_available_rate = 100 - utilization.gpu
446
+ max_available_gpu = i
447
+ # 打印可用的GPU ID
448
+ if max_available_gpu > -1:
449
+ print("Available GPU ID: %d, Free Rate: %0.2f%%" % (max_available_gpu, max_available_rate))
450
+ else:
451
+ print("No Available GPU!")
452
+
453
+ # Shutdown NVML
454
+ pynvml.nvmlShutdown()
455
+ return max_available_gpu
456
+
457
+
458
+ def eval_metrics(output_mode, truths, preds, threshold=0.5):
+     '''
+     Compute evaluation metrics for the given output mode.
+     :param output_mode: task type (multi_label/multi_class/regression/binary_class)
+     :param truths: ground-truth values
+     :param preds: predicted values
+     :param threshold: decision threshold for (multi-)label tasks
+     :return: metrics dict
+     '''
+     print("\ntruths size: ", truths.shape)
+     print("\npreds size: ", preds.shape)
+     if output_mode in ["multi-label", "multi_label"]:
+         return metrics_multi_label(truths, preds, threshold=threshold)
+     elif output_mode in ["multi-class", "multi_class"]:
+         return metrics_multi_class(truths, preds)
+     elif output_mode == "regression":
+         return metrics_regression(truths, preds)
+     elif output_mode in ["binary-class", "binary_class"]:
+         return metrics_binary(truths, preds, threshold=threshold)
+     else:
+         raise Exception("Unsupported output mode: %s" % output_mode)
+
+
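+ # Dispatch example (illustrative): eval_metrics("binary_class", truths, preds,
+ # threshold=0.5) calls metrics_binary(truths, preds, threshold=0.5).
+
+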
+ def load_trained_model(model_config, args, model_class, model_dirpath):
+     # load an existing checkpoint
+     print("load pretrained model: %s" % model_dirpath)
+     try:
+         model = model_class.from_pretrained(model_dirpath, args=args)
+     except Exception:
+         # fall back to building the model and loading the raw state dict
+         model = model_class(model_config, args=args)
+         pretrained_net_dict = torch.load(os.path.join(model_dirpath, "pytorch.pth"),
+                                          map_location=torch.device("cpu"))
+         model_state_dict_keys = set()
+         for key in model.state_dict():
+             model_state_dict_keys.add(key)
+         new_state_dict = OrderedDict()
+         for k, v in pretrained_net_dict.items():
+             if k.startswith("module."):
+                 # remove the `module.` prefix added by DataParallel/DDP
+                 name = k[7:]
+             else:
+                 name = k
+             if name in model_state_dict_keys:
+                 new_state_dict[name] = v
+         model.load_state_dict(new_state_dict)
+     return model
+
+
+ def clean_seq(protein_id, seq, return_rm_index=False):
+     '''
+     Remove invalid characters from a protein sequence, keeping only the
+     uppercase letters A-Z except 'J'.
+     :param protein_id: sequence id (used for logging)
+     :param seq: protein sequence
+     :param return_rm_index: whether to also return the removed indices
+     :return: cleaned sequence (and the removed-index set if requested)
+     '''
+     seq = seq.upper()
+     new_seq = ""
+     has_invalid_char = False
+     invalid_char_set = set()
+     return_rm_index_set = set()
+     for idx, ch in enumerate(seq):
+         if 'A' <= ch <= 'Z' and ch not in ['J']:
+             new_seq += ch
+         else:
+             invalid_char_set.add(ch)
+             return_rm_index_set.add(idx)
+             has_invalid_char = True
+     if has_invalid_char:
+         print("id: %s. Seq: %s" % (protein_id, seq))
+         print("invalid char set:", invalid_char_set)
+         print("return_rm_index:", return_rm_index_set)
+     if return_rm_index:
+         return new_seq, return_rm_index_set
+     return new_seq
+
+
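+ # Example (illustrative): clean_seq("P1", "mk#j*l") uppercases the input and
+ # keeps only A-Z except 'J', returning "MKL" ('#', 'J' and '*' are dropped).
+
+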
+ def sample_size(data_dirpath):
+     '''
+     Count the samples in a data file, or in all files of a directory
+     (hidden files are skipped; .tsv/.csv files are assumed to have a header).
+     :param data_dirpath: data file or directory path
+     :return: total sample count
+     '''
+     if os.path.isdir(data_dirpath):
+         new_filepaths = []
+         for filename in os.listdir(data_dirpath):
+             if not filename.startswith("."):
+                 new_filepaths.append(os.path.join(data_dirpath, filename))
+         filepaths = new_filepaths
+     else:
+         filepaths = [data_dirpath]
+     total = 0
+     for filepath in filepaths:
+         header = filepath.endswith(".tsv") or filepath.endswith(".csv")
+         print("sample_size filepath: %s" % filepath)
+         for _ in file_reader(filepath, header=header, header_filter=True):
+             total += 1
+     return total
+
+
+ def writer_info_tb(tb_writer, logs, global_step, prefix=None):
+     '''
+     Write log values to TensorBoard; nested dicts are flattened recursively,
+     using the parent key as the scalar-name prefix.
+     :param tb_writer: TensorBoard SummaryWriter
+     :param logs: dict of scalar values (possibly nested)
+     :param global_step: global training step
+     :param prefix: optional scalar-name prefix
+     :return:
+     '''
+     for key, value in logs.items():
+         if isinstance(value, dict):
+             writer_info_tb(tb_writer, value, global_step, prefix=key)
+         elif not math.isnan(value) and not math.isinf(value):
+             tb_writer.add_scalar(prefix + "_" + key if prefix else key, value, global_step)
+         else:
+             print("writer_info_tb NaN or Inf, Key-Value: %s=%s" % (key, value))
+
+
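+ # Example (illustrative): writer_info_tb(w, {"loss": 0.1, "eval": {"acc": 0.9}}, 100)
+ # logs scalar "loss" and, via the recursive call with prefix="eval", "eval_acc".
+
+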
+ def get_lr(optimizer):
+     '''
+     get learning rate
+     :param optimizer:
+     :return:
+     '''
+     for p in optimizer.param_groups:
+         if "lr" in p:
+             return p["lr"]
+
+
+ def metrics_merge(results, all_results):
+     '''
+     Accumulate a three-level nested metrics dict (results) into all_results,
+     summing values for keys that already exist.
+     :param results: metrics of the current batch/shard
+     :param all_results: accumulated metrics (modified in place)
+     :return: all_results
+     '''
+     for key1, value1 in results.items():
+         if key1 not in all_results:
+             all_results[key1] = {}
+         for key2, value2 in value1.items():
+             if key2 not in all_results[key1]:
+                 all_results[key1][key2] = {}
+             for key3, value3 in value2.items():
+                 if key3 not in all_results[key1][key2]:
+                     all_results[key1][key2][key3] = value3
+                 else:
+                     all_results[key1][key2][key3] += value3
+     return all_results
+
+
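+ # Example (illustrative): merging {"gene": {"mask": {"acc": 1}}} into an
+ # accumulator holding {"gene": {"mask": {"acc": 2}}} yields {"gene": {"mask": {"acc": 3}}}.
+
+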
+ def print_shape(item):
+     '''
+     print shape (recursively, for dicts/lists of tensors or arrays)
+     :param item:
+     :return:
+     '''
+     if isinstance(item, dict):
+         for key, value in item.items():
+             print(key + ":")
+             print_shape(value)
+     elif isinstance(item, list):
+         for idx, value in enumerate(item):
+             print("idx: %d" % idx)
+             print_shape(value)
+     else:
+         print("shape:", item.shape)
+
+
+ def process_outputs(output_mode, truth, pred, output_truth, output_pred, ignore_index, keep_seq=False):
+     '''
+     Flatten a batch of truths/preds for the given output mode, drop positions
+     equal to ignore_index, and append them to the accumulated arrays.
+     :return: updated (output_truth, output_pred)
+     '''
+     if keep_seq:
+         # to do
+         return None, None
+     else:
+         if output_mode in ["multi_class", "multi-class"]:
+             cur_truth = truth.view(-1)
+             cur_mask = cur_truth != ignore_index
+             cur_pred = pred.view(-1, pred.shape[-1])
+             cur_truth = cur_truth[cur_mask]
+             cur_pred = cur_pred[cur_mask, :]
+             sum_v = cur_mask.sum().item()
+         elif output_mode in ["multi_label", "multi-label"]:
+             cur_truth = truth.view(-1, truth.shape[-1])
+             cur_pred = pred.view(-1, pred.shape[-1])
+             sum_v = pred.shape[0]
+         elif output_mode in ["binary_class", "binary-class"]:
+             cur_truth = truth.view(-1)
+             cur_mask = cur_truth != ignore_index
+             cur_pred = pred.view(-1)
+             cur_truth = cur_truth[cur_mask]
+             cur_pred = cur_pred[cur_mask]
+             sum_v = cur_mask.sum().item()
+         elif output_mode in ["regression"]:
+             cur_truth = truth.view(-1)
+             cur_mask = cur_truth != ignore_index
+             cur_pred = pred.view(-1)
+             cur_truth = cur_truth[cur_mask]
+             cur_pred = cur_pred[cur_mask]
+             sum_v = cur_mask.sum().item()
+         else:
+             raise Exception("Unsupported output mode: %s" % output_mode)
+         if sum_v > 0:
+             cur_truth = cur_truth.detach().cpu().numpy()
+             cur_pred = cur_pred.detach().cpu().numpy()
+             if output_truth is None or output_pred is None:
+                 return cur_truth, cur_pred
+             else:
+                 output_truth = np.append(output_truth, cur_truth, axis=0)
+                 output_pred = np.append(output_pred, cur_pred, axis=0)
+                 return output_truth, output_pred
+         # note: when no valid positions remain, the inputs are returned unchanged
+         return truth, pred
+
+
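+ # Shape sketch (illustrative): for multi_class with pred (B, L, C) and truth
+ # (B, L), positions where truth == ignore_index are masked out, leaving an
+ # (N,) truth vector and an (N, C) prediction matrix to append along axis 0.
+
+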
+ def print_batch(value, key=None, debug_path=None, wfp=None, local_rank=-1):
+     '''
+     Print a batch (a tensor, or a dict/list of tensors) and optionally dump
+     the values to text files for debugging.
+     :param value: tensor, or a dict/list of tensors
+     :param key: name of the current entry (used in dump filenames)
+     :param debug_path: directory to write dump files into (optional)
+     :param wfp: file handle to write the summary to instead of stdout (optional)
+     :param local_rank: local rank (passed through to recursive calls)
+     :return:
+     '''
+     if isinstance(value, list):
+         for idx, v in enumerate(value):
+             if wfp is not None:
+                 if v is not None:
+                     # min, min ignoring the -100 ignore_index, and max
+                     wfp.write(str([torch.min(v), torch.min(torch.where(v == -100, 10000, v)), torch.max(v)]) + "\n")
+                     wfp.write(str(v.shape) + "\n")
+                 else:
+                     wfp.write("None\n")
+                 wfp.write("-" * 10 + "\n")
+             else:
+                 if v is not None:
+                     print([torch.min(v), torch.min(torch.where(v == -100, 10000, v)), torch.max(v)])
+                     print(v.shape)
+                 else:
+                     print("None")
+                 print("-" * 50)
+             if v is not None:
+                 try:
+                     value = v.detach().cpu().numpy().astype(int)
+                     if debug_path is not None:
+                         if value.ndim == 3:
+                             for dim_1_idx in range(value.shape[0]):
+                                 np.savetxt(os.path.join(debug_path, "%s_batch_%d.txt" % (key, dim_1_idx)), value[dim_1_idx, :, :], fmt='%i', delimiter=",")
+                         else:
+                             np.savetxt(os.path.join(debug_path, "%d.txt" % idx), value, fmt='%i', delimiter=",")
+                     else:
+                         if value.ndim == 3:
+                             for dim_1_idx in range(value.shape[0]):
+                                 # debug_path is None here, so write into the working directory
+                                 np.savetxt("%s_batch_%d.txt" % (key, dim_1_idx), value[dim_1_idx, :, :], fmt='%i', delimiter=",")
+                         else:
+                             np.savetxt("%d.txt" % idx, value, fmt='%i', delimiter=",")
+                 except Exception as e:
+                     print(e)
+     elif isinstance(value, dict):
+         for k, v in value.items():
+             if wfp is not None:
+                 wfp.write(str(k) + ":\n")
+             else:
+                 print(str(k) + ':')
+             print_batch(v, k, debug_path, wfp, local_rank)
+     else:
+         if wfp is not None:
+             if value is not None:
+                 wfp.write(str([torch.min(value), torch.min(torch.where(value == -100, 10000, value)), torch.max(value)]) + "\n")
+                 wfp.write(str(value.shape) + "\n")
+             else:
+                 wfp.write("None\n")
+             wfp.write("-" * 10 + "\n")
+         else:
+             if value is not None:
+                 print([torch.min(value), torch.min(torch.where(value == -100, 10000, value)), torch.max(value)])
+                 print(value.shape)
+             else:
+                 print("None")
+             print("-" * 10)
+         if value is not None:
+             # prot_structure values are floats; everything else is integer ids
+             if key != "prot_structure":
+                 fmt = '%i'
+                 d_type = int
+             else:
+                 fmt = '%0.4f'
+                 d_type = float
+             try:
+                 value = value.detach().cpu().numpy().astype(d_type)
+                 if debug_path is not None:
+                     if value.ndim == 3:
+                         for dim_1_idx in range(value.shape[0]):
+                             np.savetxt(os.path.join(debug_path, "%s_batch_%d.txt" % (key, dim_1_idx)), value[dim_1_idx, :, :], fmt=fmt, delimiter=",")
+                     else:
+                         np.savetxt(os.path.join(debug_path, "%s.txt" % key), value, fmt=fmt, delimiter=",")
+                 else:
+                     if value.ndim == 3:
+                         for dim_1_idx in range(value.shape[0]):
+                             np.savetxt("%s_batch_%d.txt" % (key, dim_1_idx), value[dim_1_idx, :, :], fmt=fmt, delimiter=",")
+                     else:
+                         np.savetxt("%s.txt" % key, value, fmt=fmt, delimiter=",")
+             except Exception as e:
+                 print(e)
+
+
+ def gcd(x, y):
+     '''
+     Greatest common divisor (Euclidean algorithm).
+     :param x:
+     :param y:
+     :return: gcd of x and y
+     '''
+     m = max(x, y)
+     n = min(x, y)
+     while m % n:
+         m, n = n, m % n
+     return n
+
+
+ def lcm(x, y):
+     '''
+     Least common multiple: x*y divided by gcd(x, y).
+     :param x:
+     :param y:
+     :return: lcm of x and y
+     '''
+     m = max(x, y)
+     n = min(x, y)
+     while m % n:
+         m, n = n, m % n
+     return x * y // n
+
+
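+ # Example: gcd(12, 18) == 6 and lcm(12, 18) == 36 (i.e. 12 * 18 // 6).
+
+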
+ def device_memory(gpu_id):
+     '''
+     Print the total/used/free memory of the given GPU.
+     :param gpu_id: GPU index; None or a negative value is a no-op
+     :return:
+     '''
+     if gpu_id is None or gpu_id < 0:
+         return
+     pynvml.nvmlInit()
+     device_cnt = pynvml.nvmlDeviceGetCount()
+     for idx in range(device_cnt):
+         if gpu_id != idx:
+             continue
+         handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
+         info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+         print(f"Device {idx}: {pynvml.nvmlDeviceGetName(handle)}")
+         print(f"Total memory: {info.total / 1024**3:.8f} GB")
+         print(f"Used memory: {info.used / 1024**3:.8f} GB")
+         print(f"Free memory: {info.free / 1024**3:.8f} GB")
+     pynvml.nvmlShutdown()
+
+
+ def calc_emb_filename_by_seq_id(seq_id, embedding_type):
+     """
+     Derive the embedding filename from a seq_id.
+     :param seq_id: sequence id (a FASTA-style id, possibly starting with '>')
+     :param embedding_type: embedding type, used as the filename prefix
+     :return: embedding filename
+     """
+     if seq_id[0] == ">":
+         seq_id = seq_id[1:]
+     if "|" in seq_id:
+         strs = seq_id.split("|")
+         if len(strs) > 1:
+             emb_filename = embedding_type + "_" + strs[1].strip() + ".pt"
+         else:
+             emb_filename = embedding_type + "_" + seq_id.replace(" ", "").replace("/", "_") + ".pt"
+     else:
+         emb_filename = embedding_type + "_" + seq_id.replace(" ", "").replace("/", "_") + ".pt"
+     return emb_filename
+
+
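+ # Example (illustrative): calc_emb_filename_by_seq_id(">sp|P12345|NAME", "esm")
+ # -> "esm_P12345.pt"; ids without '|' are sanitized (spaces removed, '/' -> '_').
+
+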
+ def download_file(url, local_filename):
+     with requests.get(url, stream=True) as r:
+         r.raise_for_status()
+         dir_name = os.path.dirname(local_filename)
+         if not os.path.exists(dir_name):
+             os.makedirs(dir_name)
+         with open(local_filename, 'wb') as f:
+             for chunk in r.iter_content(chunk_size=8192):
+                 if chunk:  # filter out keep-alive new chunks
+                     f.write(chunk)
+     return local_filename
+
+
+ def download_folder(base_url, file_names, local_dir):
+     if not os.path.exists(local_dir):
+         os.makedirs(local_dir)
+
+     for file_name in file_names:
+         file_url = f"{base_url}/{file_name}"
+         local_filename = os.path.join(local_dir, file_name)
+         download_file(file_url, local_filename)
+         print(f"Downloaded {file_name}")
+
+
+ def download_trained_checkpoint_lucaone(
+         llm_dir,
+         llm_type="lucaone_gplm",
+         llm_version="v2.0",
+         llm_task_level="token_level,span_level,seq_level,structure_level",
+         llm_time_str="20231125113045",
+         llm_step="5600000",
+         base_url="http://47.93.21.181/lucaone/TrainedCheckPoint"
+ ):
+     """
+     Download the trained checkpoint of LucaOne (skipped if already present).
+     :param llm_dir: local directory to store the checkpoint in
+     :param llm_type:
+     :param llm_version:
+     :param llm_task_level:
+     :param llm_time_str:
+     :param llm_step:
+     :param base_url:
+     :return:
+     """
+     print("------Download Trained LLM(LucaOne)------")
+     try:
+         logs_file_names = ["logs.txt"]
+         models_file_names = ["config.json", "pytorch.pth", "training_args.bin", "tokenizer/alphabet.pkl"]
+         logs_path = "logs/lucagplm/%s/%s/%s/%s" % (llm_version, llm_task_level, llm_type, llm_time_str)
+         models_path = "models/lucagplm/%s/%s/%s/%s/checkpoint-step%s" % (llm_version, llm_task_level, llm_type, llm_time_str, llm_step)
+         logs_local_dir = os.path.join(llm_dir, logs_path)
+         exists = True
+         for logs_file_name in logs_file_names:
+             if not os.path.exists(os.path.join(logs_local_dir, logs_file_name)):
+                 exists = False
+                 break
+         models_local_dir = os.path.join(llm_dir, models_path)
+         if exists:
+             for models_file_name in models_file_names:
+                 if not os.path.exists(os.path.join(models_local_dir, models_file_name)):
+                     exists = False
+                     break
+         if not exists:
+             print("*" * 20 + "Downloading" + "*" * 20)
+             print("Downloading LucaOne TrainedCheckPoint: LucaOne-%s-%s-%s ..." % (llm_version, llm_time_str, llm_step))
+             print("Wait a moment, please.")
+             # download logs
+             if not os.path.exists(logs_local_dir):
+                 os.makedirs(logs_local_dir)
+             logs_base_url = os.path.join(base_url, logs_path)
+             download_folder(logs_base_url, logs_file_names, logs_local_dir)
+             # download models
+             if not os.path.exists(models_local_dir):
+                 os.makedirs(models_local_dir)
+             models_base_url = os.path.join(base_url, models_path)
+             download_folder(models_base_url, models_file_names, models_local_dir)
+             print("LucaOne Download Succeed.")
+             print("*" * 50)
+     except Exception as e:
+         print(e)
+         print("Automatic download of the LucaOne trained checkpoint failed!")
+         print("You can manually download 'logs/' and 'models/' into the local directory: %s/ from %s" % (os.path.abspath(llm_dir), base_url))
+         raise Exception(e)
+
+
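+ # Usage sketch: the defaults point at the released v2.0 checkpoint, so e.g.
+ #   download_trained_checkpoint_lucaone(llm_dir="./llm")   # "./llm" is illustrative
+ # fetches logs/ and models/ under ./llm only if they are not already present.
+
+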
+ def download_trained_checkpoint_downstream_tasks(
+         save_dir="../",
+         dataset_name=["CentralDogma", "GenusTax", "InfA", "ncRNAFam", "ncRPI", "PPI", "ProtLoc", "ProtStab", "SpeciesTax", "SupKTax"],
+         dataset_type=["gene_protein", "gene", "gene_gene", "gene", "gene_protein", "protein", "protein", "protein", "gene", "gene"],
+         task_type=["binary_class", "multi_class", "binary_class", "multi_class", "binary_class", "binary_class", "multi_class", "regression", "multi_class", "multi_class"],
+         model_type=["lucappi2", "luca_base", "lucappi", "luca_base", "lucappi2", "lucappi", "luca_base", "luca_base", "luca_base", "luca_base"],
+         input_type=["matrix", "matrix", "matrix", "matrix", "matrix", "matrix", "matrix", "matrix", "matrix", "matrix"],
+         time_str=["20240406173806", "20240412100337", "20240214105653", "20240414155526", "20240404105148", "20240216205421", "20240412140824", "20240404104215", "20240411144916", "20240212202328"],
+         step=[64000, 24500, 9603, 1958484, 716380, 52304, 466005, 70371, 24000, 37000],
+         base_url="http://47.93.21.181/lucaone/DownstreamTasksTrainedModels"
+ ):
+     """
+     Download the trained downstream-task models (skipped if already present).
+     :param save_dir: local directory to save into
+     :param dataset_name:
+     :param dataset_type:
+     :param task_type:
+     :param model_type:
+     :param input_type:
+     :param time_str:
+     :param step:
+     :param base_url:
+     :return:
+     """
+     assert len(dataset_name) == len(dataset_type) == len(task_type) == \
+            len(model_type) == len(input_type) == len(time_str) == len(step)
+     assert isinstance(dataset_name, list)
+     assert isinstance(dataset_type, list)
+     assert isinstance(task_type, list)
+     assert isinstance(model_type, list)
+     assert isinstance(input_type, list)
+     assert isinstance(time_str, list)
+     assert isinstance(step, list)
+     download_succeed_task_num = 0
+     print("------Download Trained Models------")
+     for idx in range(len(dataset_name)):
+         try:
+             logs_file_names = ["logs.txt", "label.txt"]
+             models_file_names = ["config.json", "pytorch_model.bin", "training_args.bin", "tokenizer/alphabet.pkl"]
+             logs_path = "logs/%s/%s/%s/%s/%s/%s" % (dataset_name[idx], dataset_type[idx], task_type[idx], model_type[idx], input_type[idx], time_str[idx])
+             models_path = "models/%s/%s/%s/%s/%s/%s/checkpoint-%s" % (dataset_name[idx], dataset_type[idx], task_type[idx], model_type[idx], input_type[idx], time_str[idx], str(step[idx]))
+             logs_local_dir = os.path.join(save_dir, logs_path)
+             exists = True
+             for logs_file_name in logs_file_names:
+                 if not os.path.exists(os.path.join(logs_local_dir, logs_file_name)):
+                     exists = False
+                     break
+             models_local_dir = os.path.join(save_dir, models_path)
+             if exists:
+                 for models_file_name in models_file_names:
+                     if not os.path.exists(os.path.join(models_local_dir, models_file_name)):
+                         exists = False
+                         break
+             if not exists:
+                 print("*" * 20 + "Downloading" + "*" * 20)
+                 print("Downloading Downstream Task: %s TrainedCheckPoint: %s-%s-%s ..." % (dataset_name[idx], dataset_name[idx], time_str[idx], str(step[idx])))
+                 print("Wait a moment, please.")
+                 # download logs
+                 if not os.path.exists(logs_local_dir):
+                     os.makedirs(logs_local_dir)
+                 logs_base_url = os.path.join(base_url, dataset_name[idx], logs_path)
+                 download_folder(logs_base_url, logs_file_names, logs_local_dir)
+                 # download models
+                 if not os.path.exists(models_local_dir):
+                     os.makedirs(models_local_dir)
+                 models_base_url = os.path.join(base_url, dataset_name[idx], models_path)
+                 download_folder(models_base_url, models_file_names, models_local_dir)
+                 print("Downstream Task: %s Trained Model Download Succeed." % dataset_name[idx])
+                 print("*" * 50)
+             download_succeed_task_num += 1
+         except Exception as e:
+             print(e)
+             print("Automatic download of the Downstream Task: %s trained checkpoint failed!" % dataset_name[idx])
+             print("You can manually download 'logs/' and 'models/' into the local directory: %s/ from %s" % (os.path.abspath(save_dir), os.path.join(base_url, dataset_name[idx])))
+             raise Exception(e)
+     print("%d Downstream Task Trained Model(s) Download Succeed." % download_succeed_task_num)