update
- README.md +32 -0
- config.json +9 -10
- configuration_mprna.py → configuration_omnigenome.py +21 -72
- model.safetensors +2 -2
- modeling_mprna.py → modeling_omnigenome.py +358 -203
README.md
CHANGED
@@ -1,3 +1,35 @@
 ---
 license: mit
+language:
+- rna
+- dna
+
+tags:
+- Genomic-Language-Modeling
+- OmniGenome Foundation Model
 ---
+
+# Multi-species Foundation Model for Universal RNA and DNA Downstream Tasks
+
+# Notes
+We keep updating the checkpoints; the current checkpoint has been trained for 0.85 epochs.
+
+## Training Examples
+Refer to the GitHub repository: [https://github.com/yangheng95/OmniGenome](https://github.com/yangheng95/OmniGenome)
+
+## Usage
+This model can be used as a drop-in replacement for genomic foundation models such as CDSBERT, Nucleotide Transformers, DNABERT2, etc.
+```
+from transformers import AutoModel
+model = AutoModel.from_pretrained("yangheng/OmniGenome-52M", trust_remote_code=True)
+```
+
+## Subtasks
+- Secondary structure prediction
+- Genome Sequence Classification
+- Genome Sequence Regression
+- Single Nucleotide Repair
+- Genome Masked Language Modeling
+- etc.
+
+Part of the code is adapted from ESM2.
config.json
CHANGED
@@ -1,17 +1,16 @@
 {
+  "OmniGenomefold_config": null,
+  "_name_or_path": "./",
   "architectures": [
+    "OmniGenomeForTokenClassification"
   ],
   "attention_probs_dropout_prob": 0.0,
   "auto_map": {
+    "AutoConfig": "configuration_omnigenome.OmniGenomeConfig",
+    "AutoModel": "modeling_omnigenome.OmniGenomeModel",
+    "AutoModelForMaskedLM": "modeling_omnigenome.OmniGenomeForMaskedLM",
+    "AutoModelForSeq2SeqLM": "modeling_omnigenome.OmniGenomeForSeq2SeqLM",
+    "AutoModelForTokenClassification": "modeling_omnigenome.OmniGenomeForTokenClassification"
-    "AutoTokenizer": "tokenization_mprna.MPRNATokenizer"
   },
   "classifier_dropout": null,
   "emb_layer_norm_before": false,
@@ -24,7 +23,7 @@
   "layer_norm_eps": 1e-05,
   "mask_token_id": 23,
   "max_position_embeddings": 1026,
+  "model_type": "omnigenome",
   "num_attention_heads": 30,
   "num_hidden_layers": 32,
   "pad_token_id": 1,
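The `auto_map` block above is what lets `from_pretrained(..., trust_remote_code=True)` resolve the standard Auto classes to the custom OmniGenome modules shipped with the checkpoint. A minimal sketch (the repo id is taken from the README; the printed class names follow from the mapping above):

```python
# Sketch: auto_map routes each Auto* entry point to the remote code files in the repo,
# so the OmniGenome classes are used without a transformers release that knows about them.
from transformers import AutoConfig, AutoModelForTokenClassification

repo = "yangheng/OmniGenome-52M"
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)   # OmniGenomeConfig, from configuration_omnigenome.py

model = AutoModelForTokenClassification.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)    # OmniGenomeForTokenClassification, from modeling_omnigenome.py
```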
configuration_mprna.py → configuration_omnigenome.py
RENAMED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+""" OmniGenome model configuration"""

 from dataclasses import asdict, dataclass
 from typing import Optional
@@ -24,18 +24,19 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)

 # TODO Update this
+OmniGenome_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "yangheng/OmniGenome-52M": "https://huggingface.co/yangheng/OmniGenome-52M/resolve/main/config.json",
+    "yangheng/OmniGenome-186M": "https://huggingface.co/yangheng/OmniGenome-186M/resolve/main/config.json",
+    # See all OmniGenome models at https://huggingface.co/models?filter=OmniGenome
 }


+class OmniGenomeConfig(PretrainedConfig):
     r"""
+    This is the configuration class to store the configuration of a [`OmniGenomeModel`]. It is used to instantiate a OmniGenome model
     according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the OmniGenome
+    [yangheng/OmniGenome-52M](https://huggingface.co/yangheng/OmniGenome-52M) architecture.

     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -43,14 +44,14 @@ class MPRNAConfig(PretrainedConfig):

     Args:
         vocab_size (`int`, *optional*):
+            Vocabulary size of the OmniGenome model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`OmniGenomeModel`].
         mask_token_id (`int`, *optional*):
             The index of the mask token in the vocabulary. This must be included in the config because of the
             "mask-dropout" scaling trick, which will scale the inputs depending on the number of masked tokens.
         pad_token_id (`int`, *optional*):
             The index of the padding token in the vocabulary. This must be included in the config because certain parts
+            of the OmniGenome code use this instead of the attention mask.
         hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
         num_hidden_layers (`int`, *optional*, defaults to 12):
@@ -89,11 +90,11 @@ class MPRNAConfig(PretrainedConfig):
     Examples:

     ```python
+    # >>> from transformers import OmniGenomeModel, OmniGenomeConfig
     #
+    # >>> # Initializing a OmniGenome yangheng/OmniGenome-52M style configuration >>> configuration = OmniGenomeConfig()
     #
+    # >>> # Initializing a model from the configuration >>> model = OmniGenomeModel(configuration)
     #
     # >>> # Accessing the model configuration >>> configuration = model.config
     ```"""
@@ -119,7 +120,7 @@ class MPRNAConfig(PretrainedConfig):
         emb_layer_norm_before=None,
         token_dropout=False,
         is_folding_model=False,
+        OmniGenomefold_config=None,
         vocab_list=None,
         **kwargs,
     ):
@@ -142,30 +143,13 @@ class MPRNAConfig(PretrainedConfig):
         self.emb_layer_norm_before = emb_layer_norm_before
         self.token_dropout = token_dropout
         self.is_folding_model = is_folding_model
-                MPRNAfold_config = MPRNAFoldConfig()
-            elif isinstance(MPRNAfold_config, dict):
-                MPRNAfold_config = MPRNAFoldConfig(**MPRNAfold_config)
-            self.MPRNAfold_config = MPRNAfold_config
-            if vocab_list is None:
-                logger.warning(
-                    "No vocab_list supplied for folding model, assuming the MPRNA-2 vocabulary!"
-                )
-                self.vocab_list = get_default_vocab_list()
-            else:
-                self.vocab_list = vocab_list
-        else:
-            self.MPRNAfold_config = None
-            self.vocab_list = None
-        if self.MPRNAfold_config is not None and getattr(
-            self.MPRNAfold_config, "use_MPRNA_attn_map", False
+        self.OmniGenomefold_config = None
+        self.vocab_list = None
+        if self.OmniGenomefold_config is not None and getattr(
+            self.OmniGenomefold_config, "use_OmniGenome_attn_map", False
         ):
             raise ValueError(
+                "The HuggingFace port of OmniGenomeFold does not support use_OmniGenome_attn_map at this time!"
             )

     def to_dict(self):
@@ -176,41 +160,6 @@ class MPRNAConfig(PretrainedConfig):
         `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
         """
         output = super().to_dict()
-        if isinstance(self.MPRNAfold_config, MPRNAFoldConfig):
-            output["MPRNAfold_config"] = self.MPRNAfold_config.to_dict()
-        return output
-
-
-@dataclass
-class MPRNAFoldConfig:
-    MPRNA_type: str = None
-    fp16_MPRNA: bool = True
-    use_MPRNA_attn_map: bool = False
-    MPRNA_ablate_pairwise: bool = False
-    MPRNA_ablate_sequence: bool = False
-    MPRNA_input_dropout: float = 0
-
-    embed_aa: bool = True
-    bypass_lm: bool = False
-
-    lddt_head_hid_dim: int = 128
-    trunk: "TrunkConfig" = None
-
-    def __post_init__(self):
-        if self.trunk is None:
-            self.trunk = TrunkConfig()
-        elif isinstance(self.trunk, dict):
-            self.trunk = TrunkConfig(**self.trunk)
-
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
-
-        Returns:
-            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
-        """
-        output = asdict(self)
-        output["trunk"] = self.trunk.to_dict()
         return output

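A minimal sketch of constructing the configuration class directly, assuming `configuration_omnigenome.py` is importable from the working directory; the argument values are illustrative defaults taken from the docstring above.

```python
# Sketch: build an OmniGenomeConfig locally and round-trip it through to_dict().
from configuration_omnigenome import OmniGenomeConfig

config = OmniGenomeConfig(hidden_size=768, num_hidden_layers=12)
d = config.to_dict()
print(d["hidden_size"], d["num_hidden_layers"])  # 768 12
```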
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
+oid sha256:1e9fec6baa4327e0554e927998fe8079d4223517276478567502fb5a6cb59790
+size 745777424
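The two lines above are a Git LFS pointer rather than the weights themselves. A small sketch for checking a downloaded `model.safetensors` against the recorded oid and size (the local path is hypothetical):

```python
# Sketch: verify a locally downloaded model.safetensors against the LFS pointer above.
import hashlib
import os

path = "model.safetensors"  # hypothetical local path
expected_oid = "1e9fec6baa4327e0554e927998fe8079d4223517276478567502fb5a6cb59790"
expected_size = 745777424

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert os.path.getsize(path) == expected_size
assert h.hexdigest() == expected_oid
print("model.safetensors matches the LFS pointer")
```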
modeling_mprna.py → modeling_omnigenome.py
RENAMED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 ColaLab-UoE (https://colalab.ai/), Meta and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+""" PyTorch OmniGenome model."""

 import math
 from typing import List, Optional, Tuple, Union
@@ -23,25 +23,36 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers import add_start_docstrings, PreTrainedModel

+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MaskedLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)

+from transformers.pytorch_utils import (
+    find_pruneable_heads_and_indices,
+    prune_linear_layer,
+)

+from transformers.utils import (
+    logging,
+    add_code_sample_docstrings,
+    add_start_docstrings_to_model_forward,
+)

+from .configuration_omnigenome import OmniGenomeConfig

 logger = logging.get_logger(__name__)

+_CHECKPOINT_FOR_DOC = "yangheng/OmniGenome-52M"
+_CONFIG_FOR_DOC = "OmniGenomeConfig"

+OmniGenome_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "yangheng/OmniGenome-52M",
+    # This is not a complete list of all OmniGenome models!
+    # See all OmniGenome models at https://huggingface.co/models?filter=OmniGenome
 ]

@@ -59,7 +70,7 @@ def apply_rotary_pos_emb(x, cos, sin):

 def gelu(x):
     """
+    This is the gelu implementation from the original OmniGenome repo. Using F.gelu yields subtly wrong results.
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

@@ -81,6 +92,7 @@ def average_product_correct(x):
     return normalized


+# Copied from transformers.models.esm.modeling_esm.RotaryEmbedding
 class RotaryEmbedding(torch.nn.Module):
     """
     Rotary position embeddings based on those in
@@ -118,7 +130,7 @@ class RotaryEmbedding(torch.nn.Module):
         return self._cos_cached, self._sin_cached

     def forward(
+        self, q: torch.Tensor, k: torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
             k, seq_dimension=-2
@@ -130,14 +142,15 @@ class RotaryEmbedding(torch.nn.Module):
         )


+# Copied from transformers.models.esm.modeling_esm.EsmContactPredictionHead with Esm->OmniGenome
+class OmniGenomeContactPredictionHead(nn.Module):
     """Performs symmetrization, apc, and computes a logistic regression on the output features"""

     def __init__(
+        self,
+        in_features: int,
+        bias=True,
+        eos_idx: int = 2,
     ):
         super().__init__()
         self.in_features = in_features
@@ -165,7 +178,8 @@ class MPRNAContactPredictionHead(nn.Module):
         return self.activation(self.regression(attentions).squeeze(3))


+# Copied from transformers.models.esm.modeling_esm.EsmEmbeddings with Esm->OmniGenome
+class OmniGenomeEmbeddings(nn.Module):
     """
     Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
     """
@@ -203,12 +217,12 @@ class MPRNAEmbeddings(nn.Module):
         self.mask_token_id = config.mask_token_id

     def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        position_ids=None,
+        inputs_embeds=None,
+        past_key_values_length=0,
     ):
         if position_ids is None:
             if input_ids is not None:
@@ -224,11 +238,11 @@ class MPRNAEmbeddings(nn.Module):
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)

+        # Note that if we want to support OmniGenome-1 (not 1b!) in future then we need to support an
         # embedding_scale factor here.
         embeddings = inputs_embeds

+        # Matt: OmniGenome has the option to handle masking in MLM in a slightly unusual way. If the token_dropout
         # flag is False then it is handled in the same was as BERT/RoBERTa. If it is set to True, however,
         # masked tokens are treated as if they were selected for input dropout and zeroed out.
         # This "mask-dropout" is compensated for when masked tokens are not present, by scaling embeddings by
@@ -240,16 +254,16 @@ class MPRNAEmbeddings(nn.Module):
                 (input_ids == self.mask_token_id).unsqueeze(-1), 0.0
             )
             mask_ratio_train = (
+                0.15 * 0.8
+            )  # Hardcoded as the ratio used in all OmniGenome model training runs
             src_lengths = attention_mask.sum(-1)
             mask_ratio_observed = (input_ids == self.mask_token_id).sum(
                 -1
             ).float() / src_lengths
             embeddings = (
+                embeddings
+                * (1 - mask_ratio_train)
+                / (1 - mask_ratio_observed)[:, None, None]
             ).to(embeddings.dtype)

         if self.position_embedding_type == "absolute":
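The mask-dropout hunk above zeroes masked-token embeddings and rescales the rest by (1 - 0.15 * 0.8) / (1 - observed mask ratio). A standalone sketch of that arithmetic on a toy batch (the mask token id 23 is taken from config.json; all tensors are made up):

```python
# Sketch of the "mask-dropout" rescaling shown in the embeddings hunk above.
import torch

mask_token_id = 23                                   # from config.json
input_ids = torch.tensor([[5, 23, 7, 23, 9, 1]])      # toy ids, last position is padding
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 0]])
embeddings = torch.randn(1, 6, 4)                     # toy (batch, seq, hidden)

# zero out masked positions
embeddings = embeddings.masked_fill((input_ids == mask_token_id).unsqueeze(-1), 0.0)

mask_ratio_train = 0.15 * 0.8                         # ratio used during pre-training
src_lengths = attention_mask.sum(-1)
mask_ratio_observed = (input_ids == mask_token_id).sum(-1).float() / src_lengths
embeddings = embeddings * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]

print(mask_ratio_observed)  # tensor([0.4000]); unmasked embeddings scaled by 0.88 / 0.6 ≈ 1.47
```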
1 |
# coding=utf-8
|
2 |
+
# Copyright 2022 ColaLab-UoE (https://colalab.ai/), Meta and The HuggingFace Inc. team. All rights reserved.
|
3 |
#
|
4 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
# you may not use this file except in compliance with the License.
|
|
|
12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
15 |
+
""" PyTorch OmniGenome model."""
|
16 |
|
17 |
import math
|
18 |
from typing import List, Optional, Tuple, Union
|
|
|
23 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
24 |
from transformers import add_start_docstrings, PreTrainedModel
|
25 |
|
26 |
+
from transformers.modeling_outputs import (
|
27 |
+
BaseModelOutputWithPastAndCrossAttentions,
|
28 |
+
BaseModelOutputWithPoolingAndCrossAttentions,
|
29 |
+
MaskedLMOutput,
|
30 |
+
SequenceClassifierOutput,
|
31 |
+
TokenClassifierOutput,
|
32 |
+
)
|
33 |
|
34 |
+
from transformers.pytorch_utils import (
|
35 |
+
find_pruneable_heads_and_indices,
|
36 |
+
prune_linear_layer,
|
37 |
+
)
|
38 |
|
39 |
+
from transformers.utils import (
|
40 |
+
logging,
|
41 |
+
add_code_sample_docstrings,
|
42 |
+
add_start_docstrings_to_model_forward,
|
43 |
+
)
|
44 |
|
45 |
+
from .configuration_omnigenome import OmniGenomeConfig
|
46 |
|
47 |
logger = logging.get_logger(__name__)
|
48 |
|
49 |
+
_CHECKPOINT_FOR_DOC = "yangheng/OmniGenome-52M"
|
50 |
+
_CONFIG_FOR_DOC = "OmniGenomeConfig"
|
51 |
|
52 |
+
OmniGenome_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
53 |
+
"yangheng/OmniGenome-52M",
|
54 |
+
# This is not a complete list of all OmniGenome models!
|
55 |
+
# See all OmniGenome models at https://huggingface.co/models?filter=OmniGenome
|
56 |
]
|
57 |
|
58 |
|
|
|
70 |
|
71 |
def gelu(x):
|
72 |
"""
|
73 |
+
This is the gelu implementation from the original OmniGenome repo. Using F.gelu yields subtly wrong results.
|
74 |
"""
|
75 |
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
76 |
|
|
|
92 |
return normalized
|
93 |
|
94 |
|
95 |
+
# Copied from transformers.models.esm.modeling_esm.RotaryEmbedding
|
96 |
class RotaryEmbedding(torch.nn.Module):
|
97 |
"""
|
98 |
Rotary position embeddings based on those in
|
|
|
130 |
return self._cos_cached, self._sin_cached
|
131 |
|
132 |
def forward(
|
133 |
+
self, q: torch.Tensor, k: torch.Tensor
|
134 |
) -> Tuple[torch.Tensor, torch.Tensor]:
|
135 |
self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
|
136 |
k, seq_dimension=-2
|
|
|
142 |
)
|
143 |
|
144 |
|
145 |
+
# Copied from transformers.models.esm.modeling_esm.EsmContactPredictionHead with Esm->OmniGenome
|
146 |
+
class OmniGenomeContactPredictionHead(nn.Module):
|
147 |
"""Performs symmetrization, apc, and computes a logistic regression on the output features"""
|
148 |
|
149 |
def __init__(
|
150 |
+
self,
|
151 |
+
in_features: int,
|
152 |
+
bias=True,
|
153 |
+
eos_idx: int = 2,
|
154 |
):
|
155 |
super().__init__()
|
156 |
self.in_features = in_features
|
|
|
178 |
return self.activation(self.regression(attentions).squeeze(3))
|
179 |
|
180 |
|
181 |
+
# Copied from transformers.models.esm.modeling_esm.EsmEmbeddings with Esm->OmniGenome
|
182 |
+
class OmniGenomeEmbeddings(nn.Module):
|
183 |
"""
|
184 |
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
|
185 |
"""
|
|
|
217 |
self.mask_token_id = config.mask_token_id
|
218 |
|
219 |
def forward(
|
220 |
+
self,
|
221 |
+
input_ids=None,
|
222 |
+
attention_mask=None,
|
223 |
+
position_ids=None,
|
224 |
+
inputs_embeds=None,
|
225 |
+
past_key_values_length=0,
|
226 |
):
|
227 |
if position_ids is None:
|
228 |
if input_ids is not None:
|
|
|
238 |
if inputs_embeds is None:
|
239 |
inputs_embeds = self.word_embeddings(input_ids)
|
240 |
|
241 |
+
# Note that if we want to support OmniGenome-1 (not 1b!) in future then we need to support an
|
242 |
# embedding_scale factor here.
|
243 |
embeddings = inputs_embeds
|
244 |
|
245 |
+
# Matt: OmniGenome has the option to handle masking in MLM in a slightly unusual way. If the token_dropout
|
246 |
# flag is False then it is handled in the same was as BERT/RoBERTa. If it is set to True, however,
|
247 |
# masked tokens are treated as if they were selected for input dropout and zeroed out.
|
248 |
# This "mask-dropout" is compensated for when masked tokens are not present, by scaling embeddings by
|
|
|
254 |
(input_ids == self.mask_token_id).unsqueeze(-1), 0.0
|
255 |
)
|
256 |
mask_ratio_train = (
|
257 |
+
0.15 * 0.8
|
258 |
+
) # Hardcoded as the ratio used in all OmniGenome model training runs
|
259 |
src_lengths = attention_mask.sum(-1)
|
260 |
mask_ratio_observed = (input_ids == self.mask_token_id).sum(
|
261 |
-1
|
262 |
).float() / src_lengths
|
263 |
embeddings = (
|
264 |
+
embeddings
|
265 |
+
* (1 - mask_ratio_train)
|
266 |
+
/ (1 - mask_ratio_observed)[:, None, None]
|
267 |
).to(embeddings.dtype)
|
268 |
|
269 |
if self.position_embedding_type == "absolute":
|
|
|
301 |
return position_ids.unsqueeze(0).expand(input_shape)
|
302 |
|
303 |
|
304 |
+
# Copied from transformers.models.esm.modeling_esm.EsmSelfAttention with Esm->OmniGenome
|
305 |
+
class OmniGenomeSelfAttention(nn.Module):
|
306 |
def __init__(self, config, position_embedding_type=None):
|
307 |
super().__init__()
|
308 |
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
|
309 |
+
config, "embedding_size"
|
310 |
):
|
311 |
raise ValueError(
|
312 |
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
|
|
|
327 |
)
|
328 |
self.rotary_embeddings = None
|
329 |
if (
|
330 |
+
self.position_embedding_type == "relative_key"
|
331 |
+
or self.position_embedding_type == "relative_key_query"
|
332 |
):
|
333 |
self.max_position_embeddings = config.max_position_embeddings
|
334 |
self.distance_embedding = nn.Embedding(
|
|
|
348 |
return x.permute(0, 2, 1, 3)
|
349 |
|
350 |
def forward(
|
351 |
+
self,
|
352 |
+
hidden_states: torch.Tensor,
|
353 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
354 |
+
head_mask: Optional[torch.FloatTensor] = None,
|
355 |
+
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
356 |
+
encoder_attention_mask: Optional[torch.FloatTensor] = None,
|
357 |
+
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
358 |
+
output_attentions: Optional[bool] = False,
|
359 |
) -> Tuple[torch.Tensor]:
|
360 |
mixed_query_layer = self.query(hidden_states)
|
361 |
|
|
|
385 |
query_layer = self.transpose_for_scores(mixed_query_layer)
|
386 |
|
387 |
# Matt: Our BERT model (which this code was derived from) scales attention logits down by sqrt(head_dim).
|
388 |
+
# OmniGenome scales the query down by the same factor instead. Modulo numerical stability these are equivalent,
|
389 |
# but not when rotary embeddings get involved. Therefore, we scale the query here to match the original
|
390 |
+
# OmniGenome code and fix rotary embeddings.
|
391 |
+
query_layer = query_layer * self.attention_head_size ** -0.5
|
392 |
|
393 |
if self.is_decoder:
|
394 |
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
|
|
|
407 |
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
408 |
|
409 |
if (
|
410 |
+
self.position_embedding_type == "relative_key"
|
411 |
+
or self.position_embedding_type == "relative_key_query"
|
412 |
):
|
413 |
seq_length = hidden_states.size()[1]
|
414 |
position_ids_l = torch.arange(
|
|
|
438 |
"bhrd,lrd->bhlr", key_layer, positional_embedding
|
439 |
)
|
440 |
attention_scores = (
|
441 |
+
attention_scores
|
442 |
+
+ relative_position_scores_query
|
443 |
+
+ relative_position_scores_key
|
444 |
)
|
445 |
|
446 |
if attention_mask is not None:
|
447 |
+
# Apply the attention mask is (precomputed for all layers in OmniGenomeModel forward() function)
|
448 |
attention_scores = attention_scores + attention_mask
|
449 |
|
450 |
# Normalize the attention scores to probabilities.
|
|
|
473 |
return outputs
|
474 |
|
475 |
|
476 |
+
# Copied from transformers.models.esm.modeling_esm.EsmSelfOutput with Esm->OmniGenome
|
477 |
+
class OmniGenomeSelfOutput(nn.Module):
|
478 |
def __init__(self, config):
|
479 |
super().__init__()
|
480 |
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
|
|
487 |
return hidden_states
|
488 |
|
489 |
|
490 |
+
# Copied from transformers.models.esm.modeling_esm.EsmAttention with Esm->OmniGenome
|
491 |
+
class OmniGenomeAttention(nn.Module):
|
492 |
def __init__(self, config):
|
493 |
super().__init__()
|
494 |
+
self.self = OmniGenomeSelfAttention(config)
|
495 |
+
self.output = OmniGenomeSelfOutput(config)
|
496 |
self.pruned_heads = set()
|
497 |
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
498 |
|
|
|
515 |
# Update hyper params and store pruned heads
|
516 |
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
517 |
self.self.all_head_size = (
|
518 |
+
self.self.attention_head_size * self.self.num_attention_heads
|
519 |
)
|
520 |
self.pruned_heads = self.pruned_heads.union(heads)
|
521 |
|
522 |
def forward(
|
523 |
+
self,
|
524 |
+
hidden_states,
|
525 |
+
attention_mask=None,
|
526 |
+
head_mask=None,
|
527 |
+
encoder_hidden_states=None,
|
528 |
+
encoder_attention_mask=None,
|
529 |
+
past_key_value=None,
|
530 |
+
output_attentions=False,
|
531 |
):
|
532 |
hidden_states_ln = self.LayerNorm(hidden_states)
|
533 |
self_outputs = self.self(
|
|
|
541 |
)
|
542 |
attention_output = self.output(self_outputs[0], hidden_states)
|
543 |
outputs = (attention_output,) + self_outputs[
|
544 |
+
1:
|
545 |
+
] # add attentions if we output them
|
546 |
return outputs
|
547 |
|
548 |
|
549 |
+
# Copied from transformers.models.esm.modeling_esm.EsmIntermediate with Esm->OmniGenome
|
550 |
+
class OmniGenomeIntermediate(nn.Module):
|
551 |
def __init__(self, config):
|
552 |
super().__init__()
|
553 |
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
|
|
558 |
return hidden_states
|
559 |
|
560 |
|
561 |
+
# Copied from transformers.models.esm.modeling_esm.EsmOutput with Esm->OmniGenome
|
562 |
+
class OmniGenomeOutput(nn.Module):
|
563 |
def __init__(self, config):
|
564 |
super().__init__()
|
565 |
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
|
|
572 |
return hidden_states
|
573 |
|
574 |
|
575 |
+
# Copied from transformers.models.esm.modeling_esm.EsmLayer with Esm->OmniGenome
|
576 |
+
class OmniGenomeLayer(nn.Module):
|
577 |
def __init__(self, config):
|
578 |
super().__init__()
|
579 |
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
580 |
self.seq_len_dim = 1
|
581 |
+
self.attention = OmniGenomeAttention(config)
|
582 |
self.is_decoder = config.is_decoder
|
583 |
self.add_cross_attention = config.add_cross_attention
|
584 |
if self.add_cross_attention:
|
|
|
586 |
raise RuntimeError(
|
587 |
f"{self} should be used as a decoder model if cross attention is added"
|
588 |
)
|
589 |
+
self.crossattention = OmniGenomeAttention(config)
|
590 |
+
self.intermediate = OmniGenomeIntermediate(config)
|
591 |
+
self.output = OmniGenomeOutput(config)
|
592 |
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
593 |
|
594 |
def forward(
|
595 |
+
self,
|
596 |
+
hidden_states,
|
597 |
+
attention_mask=None,
|
598 |
+
head_mask=None,
|
599 |
+
encoder_hidden_states=None,
|
600 |
+
encoder_attention_mask=None,
|
601 |
+
past_key_value=None,
|
602 |
+
output_attentions=False,
|
603 |
):
|
604 |
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
|
605 |
self_attn_past_key_value = (
|
|
|
620 |
present_key_value = self_attention_outputs[-1]
|
621 |
else:
|
622 |
outputs = self_attention_outputs[
|
623 |
+
1:
|
624 |
+
] # add self attentions if we output attention weights
|
625 |
|
626 |
cross_attn_present_key_value = None
|
627 |
if self.is_decoder and encoder_hidden_states is not None:
|
|
|
646 |
)
|
647 |
attention_output = cross_attention_outputs[0]
|
648 |
outputs = (
|
649 |
+
outputs + cross_attention_outputs[1:-1]
|
650 |
) # add cross attentions if we output attention weights
|
651 |
|
652 |
# add cross-attn cache to positions 3,4 of present_key_value tuple
|
|
|
669 |
return layer_output
|
670 |
|
671 |
|
672 |
+
# Copied from transformers.models.esm.modeling_esm.EsmEncoder with Esm->OmniGenome
|
673 |
+
class OmniGenomeEncoder(nn.Module):
|
674 |
def __init__(self, config):
|
675 |
super().__init__()
|
676 |
self.config = config
|
677 |
self.layer = nn.ModuleList(
|
678 |
+
[OmniGenomeLayer(config) for _ in range(config.num_hidden_layers)]
|
679 |
)
|
680 |
self.emb_layer_norm_after = nn.LayerNorm(
|
681 |
config.hidden_size, eps=config.layer_norm_eps
|
|
|
683 |
self.gradient_checkpointing = False
|
684 |
|
685 |
def forward(
|
686 |
+
self,
|
687 |
+
hidden_states,
|
688 |
+
attention_mask=None,
|
689 |
+
head_mask=None,
|
690 |
+
encoder_hidden_states=None,
|
691 |
+
encoder_attention_mask=None,
|
692 |
+
past_key_values=None,
|
693 |
+
use_cache=None,
|
694 |
+
output_attentions=False,
|
695 |
+
output_hidden_states=False,
|
696 |
+
return_dict=True,
|
697 |
):
|
698 |
if self.gradient_checkpointing and self.training:
|
699 |
if use_cache:
|
|
|
773 |
)
|
774 |
|
775 |
|
776 |
+
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->OmniGenome
|
777 |
+
class OmniGenomePooler(nn.Module):
|
778 |
def __init__(self, config):
|
779 |
super().__init__()
|
780 |
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
|
|
789 |
return pooled_output
|
790 |
|
791 |
|
792 |
+
# Copied from transformers.models.esm.modeling_esm.EsmPreTrainedModel with Esm->OmniGenome
|
793 |
+
class OmniGenomePreTrainedModel(PreTrainedModel):
|
794 |
"""
|
795 |
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
796 |
models.
|
797 |
"""
|
798 |
|
799 |
+
config_class = OmniGenomeConfig
|
800 |
+
base_model_prefix = "OmniGenome"
|
801 |
supports_gradient_checkpointing = True
|
802 |
_no_split_modules = [
|
803 |
+
"OmniGenomeLayer",
|
804 |
+
"OmniGenomeFoldTriangularSelfAttentionBlock",
|
805 |
+
"OmniGenomeEmbeddings",
|
806 |
]
|
807 |
|
808 |
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
|
|
|
823 |
module.weight.data.fill_(1.0)
|
824 |
|
825 |
|
826 |
+
OmniGenome_START_DOCSTRING = r"""
|
827 |
|
828 |
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
829 |
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
|
|
834 |
and behavior.
|
835 |
|
836 |
Parameters:
|
837 |
+
config ([`OmniGenomeConfig`]): Model configuration class with all the parameters of the
|
838 |
model. Initializing with a config file does not load the weights associated with the model, only the
|
839 |
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
840 |
"""
|
841 |
|
842 |
+
OmniGenome_INPUTS_DOCSTRING = r"""
|
843 |
Args:
|
844 |
input_ids (`torch.LongTensor` of shape `({0})`):
|
845 |
Indices of input sequence tokens in the vocabulary.
|
|
|
882 |
|
883 |
|
884 |
@add_start_docstrings(
|
885 |
+
"The bare OmniGenome Model transformer outputting raw hidden-states without any specific head on top.",
|
886 |
+
OmniGenome_START_DOCSTRING,
|
887 |
)
|
888 |
+
# Copied from transformers.models.esm.modeling_esm.EsmModel with Esm->OmniGenome
|
889 |
+
class OmniGenomeModel(OmniGenomePreTrainedModel):
|
890 |
"""
|
891 |
|
892 |
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
|
|
903 |
super().__init__(config)
|
904 |
self.config = config
|
905 |
|
906 |
+
self.embeddings = OmniGenomeEmbeddings(config)
|
907 |
+
self.encoder = OmniGenomeEncoder(config)
|
908 |
|
909 |
+
self.pooler = OmniGenomePooler(config) if add_pooling_layer else None
|
910 |
|
911 |
+
self.contact_head = OmniGenomeContactPredictionHead(
|
912 |
in_features=config.num_hidden_layers * config.num_attention_heads, bias=True
|
913 |
)
|
914 |
|
|
|
930 |
self.encoder.layer[layer].attention.prune_heads(heads)
|
931 |
|
932 |
@add_start_docstrings_to_model_forward(
|
933 |
+
OmniGenome_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")
|
934 |
)
|
935 |
@add_code_sample_docstrings(
|
936 |
checkpoint=_CHECKPOINT_FOR_DOC,
|
|
|
938 |
config_class=_CONFIG_FOR_DOC,
|
939 |
)
|
940 |
def forward(
|
941 |
+
self,
|
942 |
+
input_ids: Optional[torch.Tensor] = None,
|
943 |
+
attention_mask: Optional[torch.Tensor] = None,
|
944 |
+
position_ids: Optional[torch.Tensor] = None,
|
945 |
+
head_mask: Optional[torch.Tensor] = None,
|
946 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
947 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
948 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
949 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
950 |
+
use_cache: Optional[bool] = None,
|
951 |
+
output_attentions: Optional[bool] = None,
|
952 |
+
output_hidden_states: Optional[bool] = None,
|
953 |
+
return_dict: Optional[bool] = None,
|
954 |
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
|
955 |
r"""
|
956 |
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
|
|
1100 |
|
1101 |
|
1102 |
@add_start_docstrings(
|
1103 |
+
"""OmniGenome Model with a `language modeling` head on top.""", OmniGenome_START_DOCSTRING
|
1104 |
)
|
1105 |
+
# Copied from transformers.models.esm.modeling_esm.EsmForMaskedLM with Esm->OmniGenome
|
1106 |
+
class OmniGenomeForMaskedLM(OmniGenomePreTrainedModel):
|
1107 |
_tied_weights_keys = ["lm_head.decoder.weight"]
|
1108 |
|
1109 |
def __init__(self, config):
|
|
|
1111 |
|
1112 |
if config.is_decoder:
|
1113 |
logger.warning(
|
1114 |
+
"If you want to use `OmniGenomeForMaskedLM` make sure `config.is_decoder=False` for "
|
1115 |
"bi-directional self-attention."
|
1116 |
)
|
1117 |
|
1118 |
+
self.OmniGenome = OmniGenomeModel(config, add_pooling_layer=False)
|
1119 |
+
self.lm_head = OmniGenomeLMHead(config)
|
1120 |
+
# self.init_weights()
|
|
|
1121 |
|
1122 |
def get_output_embeddings(self):
|
1123 |
return self.lm_head.decoder
|
|
|
1126 |
self.lm_head.decoder = new_embeddings
|
1127 |
|
1128 |
@add_start_docstrings_to_model_forward(
|
1129 |
+
OmniGenome_INPUTS_DOCSTRING.format("batch_size, sequence_length")
|
1130 |
)
|
1131 |
@add_code_sample_docstrings(
|
1132 |
checkpoint=_CHECKPOINT_FOR_DOC,
|
|
|
1135 |
mask="<mask>",
|
1136 |
)
|
1137 |
def forward(
|
1138 |
+
self,
|
1139 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1140 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1141 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1142 |
+
head_mask: Optional[torch.Tensor] = None,
|
1143 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
1144 |
+
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
1145 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
1146 |
+
labels: Optional[torch.LongTensor] = None,
|
1147 |
+
output_attentions: Optional[bool] = None,
|
1148 |
+
output_hidden_states: Optional[bool] = None,
|
1149 |
+
return_dict: Optional[bool] = None,
|
1150 |
) -> Union[Tuple, MaskedLMOutput]:
|
1151 |
r"""
|
1152 |
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
|
|
1160 |
return_dict if return_dict is not None else self.config.use_return_dict
|
1161 |
)
|
1162 |
|
1163 |
+
outputs = self.OmniGenome(
|
1164 |
input_ids,
|
1165 |
attention_mask=attention_mask,
|
1166 |
position_ids=position_ids,
|
|
|
1198 |
)
|
1199 |
|
1200 |
def predict_contacts(self, tokens, attention_mask):
|
1201 |
+
return self.OmniGenome.predict_contacts(tokens, attention_mask=attention_mask)
|
1202 |
|
1203 |
|
1204 |
+
# Copied from transformers.models.esm.modeling_esm.EsmLMHead with Esm->OmniGenome
|
1205 |
+
class OmniGenomeLMHead(nn.Module):
|
1206 |
+
"""OmniGenome Head for masked language modeling."""
|
1207 |
|
1208 |
def __init__(self, config):
|
1209 |
super().__init__()
|
|
|
1225 |
|
1226 |
@add_start_docstrings(
|
1227 |
"""
|
1228 |
+
OmniGenome Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
|
1229 |
output) e.g. for GLUE tasks.
|
1230 |
""",
|
1231 |
+
OmniGenome_START_DOCSTRING,
|
1232 |
)
|
1233 |
+
class OmniGenomeForSequenceClassification(OmniGenomePreTrainedModel):
|
1234 |
def __init__(self, config):
|
1235 |
super().__init__(config)
|
1236 |
self.num_labels = config.num_labels
|
1237 |
self.config = config
|
1238 |
+
self.OmniGenome = OmniGenomeModel(config, add_pooling_layer=False)
|
1239 |
+
self.classifier = OmniGenomeClassificationHead(config)
|
1240 |
+
# self.init_weights()
|
|
|
|
|
1241 |
|
1242 |
@add_start_docstrings_to_model_forward(
|
1243 |
+
OmniGenome_INPUTS_DOCSTRING.format("batch_size, sequence_length")
|
1244 |
)
|
1245 |
@add_code_sample_docstrings(
|
1246 |
checkpoint=_CHECKPOINT_FOR_DOC,
|
|
|
1248 |
config_class=_CONFIG_FOR_DOC,
|
1249 |
)
|
1250 |
def forward(
|
1251 |
+
self,
|
1252 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1253 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1254 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1255 |
+
head_mask: Optional[torch.Tensor] = None,
|
1256 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
1257 |
+
labels: Optional[torch.LongTensor] = None,
|
1258 |
+
output_attentions: Optional[bool] = None,
|
1259 |
+
output_hidden_states: Optional[bool] = None,
|
1260 |
+
return_dict: Optional[bool] = None,
|
1261 |
) -> Union[Tuple, SequenceClassifierOutput]:
|
1262 |
r"""
|
1263 |
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
|
|
1269 |
return_dict if return_dict is not None else self.config.use_return_dict
|
1270 |
)
|
1271 |
|
1272 |
+
outputs = self.OmniGenome(
|
1273 |
input_ids,
|
1274 |
attention_mask=attention_mask,
|
1275 |
position_ids=position_ids,
|
|
|
1290 |
if self.num_labels == 1:
|
1291 |
self.config.problem_type = "regression"
|
1292 |
elif self.num_labels > 1 and (
|
1293 |
+
labels.dtype == torch.long or labels.dtype == torch.int
|
1294 |
):
|
1295 |
self.config.problem_type = "single_label_classification"
|
1296 |
else:
|
|
|
1323 |
|
1324 |
@add_start_docstrings(
|
1325 |
"""
|
1326 |
+
OmniGenome Model with a token classification head on top (a linear layer on top of the hidden-states output)
|
1327 |
+
Note that this model is pre-trained for RNA secondary structure prediction and can be used for zero-shot RNA
|
1328 |
+
secondary structure prediction. Please find more advanced usages at https://github.com/yangheng95/OmniGenome
|
1329 |
+
This model can be fine-tuned for other token classification tasks.
|
1330 |
""",
|
1331 |
+
OmniGenome_START_DOCSTRING,
|
1332 |
)
|
1333 |
+
# Copied from transformers.models.esm.modeling_esm.EsmForTokenClassification with Esm->OmniGenome
|
1334 |
+
class OmniGenomeForTokenClassification(OmniGenomePreTrainedModel):
|
1335 |
def __init__(self, config):
|
1336 |
super().__init__(config)
|
1337 |
self.num_labels = config.num_labels
|
1338 |
+
self.OmniGenome = OmniGenomeModel(config, add_pooling_layer=False)
|
1339 |
+
self.lm_head = OmniGenomeLMHead(config)
|
1340 |
+
self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
|
1341 |
+
self.classifier = torch.nn.Linear(self.config.hidden_size, self.num_labels)
|
1342 |
+
self.activation = torch.nn.Tanh()
|
1343 |
+
self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
|
1344 |
+
# self.init_weights()
|
1345 |
|
1346 |
+
@add_start_docstrings_to_model_forward(
|
1347 |
+
OmniGenome_INPUTS_DOCSTRING.format("batch_size, sequence_length")
|
1348 |
+
)
|
1349 |
+
@add_code_sample_docstrings(
|
1350 |
+
checkpoint=_CHECKPOINT_FOR_DOC,
|
1351 |
+
output_type=TokenClassifierOutput,
|
1352 |
+
config_class=_CONFIG_FOR_DOC,
|
1353 |
+
)
|
1354 |
+
def forward(
|
1355 |
+
self,
|
1356 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1357 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1358 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1359 |
+
head_mask: Optional[torch.Tensor] = None,
|
1360 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
1361 |
+
labels: Optional[torch.LongTensor] = None,
|
1362 |
+
output_attentions: Optional[bool] = None,
|
1363 |
+
output_hidden_states: Optional[bool] = None,
|
1364 |
+
return_dict: Optional[bool] = None,
|
1365 |
+
) -> Union[Tuple, TokenClassifierOutput]:
|
1366 |
+
r"""
|
1367 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
1368 |
+
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
|
1369 |
+
"""
|
1370 |
+
|
1371 |
+
return_dict = (
|
1372 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
1373 |
+
)
|
1374 |
+
|
1375 |
+
mlm_outputs = self.OmniGenome(
|
1376 |
+
input_ids,
|
1377 |
+
attention_mask=attention_mask,
|
1378 |
+
position_ids=position_ids,
|
1379 |
+
head_mask=head_mask,
|
1380 |
+
inputs_embeds=inputs_embeds,
|
1381 |
+
output_attentions=output_attentions,
|
1382 |
+
output_hidden_states=output_hidden_states,
|
1383 |
+
return_dict=return_dict,
|
1384 |
+
)
|
1385 |
+
try:
|
1386 |
+
last_hidden_state = mlm_outputs[0]
|
1387 |
+
last_hidden_state = self.dense(last_hidden_state)
|
1388 |
+
except:
|
1389 |
+
last_hidden_state = mlm_outputs.hidden_states[-1]
|
1390 |
+
last_hidden_state = self.dense(last_hidden_state)
|
1391 |
+
|
1392 |
+
logits = self.classifier(last_hidden_state)
|
1393 |
+
logits = torch.softmax(logits, dim=-1)
|
1394 |
+
logits = self.activation(logits)
|
1395 |
+
logits = self.dropout(logits)
|
1396 |
|
1397 |
+
loss = None
|
1398 |
+
if labels is not None:
|
1399 |
+
loss_fct = CrossEntropyLoss()
|
1400 |
+
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
1401 |
+
|
1402 |
+
if not return_dict:
|
1403 |
+
output = (logits,) + mlm_outputs[2:]
|
1404 |
+
return ((loss,) + output) if loss is not None else output
|
1405 |
+
|
1406 |
+
return TokenClassifierOutput(
|
1407 |
+
loss=loss,
|
1408 |
+
logits=logits,
|
1409 |
+
hidden_states=mlm_outputs.hidden_states,
|
1410 |
+
attentions=mlm_outputs.attentions,
|
1411 |
+
)
|
1412 |
+
|
1413 |
+
@staticmethod
|
1414 |
+
def verify_secondary_structure(structure):
|
1415 |
+
structure = list(structure)
|
1416 |
+
left_brackets = []
|
1417 |
+
right_brackets = []
|
1418 |
+
for i, char in enumerate(structure):
|
1419 |
+
if char == "(":
|
1420 |
+
left_brackets.append(i)
|
1421 |
+
elif char == ")":
|
1422 |
+
if left_brackets:
|
1423 |
+
left_brackets.pop()
|
1424 |
+
else:
|
1425 |
+
right_brackets.append(i)
|
1426 |
+
|
1427 |
+
for i in left_brackets:
|
1428 |
+
structure[i] = "."
|
1429 |
+
for i in right_brackets:
|
1430 |
+
structure[i] = "."
|
1431 |
+
|
1432 |
+
structure = "".join(structure)
|
1433 |
+
|
1434 |
+
return structure
|
1435 |
+
|
1436 |
+
def predict_structure(
|
1437 |
+
self,
|
1438 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1439 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1440 |
+
**kwargs
|
1441 |
+
) -> List[str]:
|
1442 |
+
"""
|
1443 |
+
Predicts the secondary structure of a sequence given the logits and attention mask.
|
1444 |
+
"""
|
1445 |
+
outputs = self.forward(input_ids, attention_mask, **kwargs)
|
1446 |
+
|
1447 |
+
logits = torch.argmax(outputs.logits, dim=-1)
|
1448 |
+
lengths = torch.sum(torch.ne(torch.tensor(0), attention_mask), dim=-1)
|
1449 |
+
structures = []
|
1450 |
+
for i, length in enumerate(lengths):
|
1451 |
+
structure = logits[i, :length].cpu().numpy()
|
1452 |
+
structure = "".join(self.config.id2label[label] for label in structure)
|
1453 |
+
if self.config.verify_ss:
|
1454 |
+
structure = self.verify_secondary_structure(structure)
|
1455 |
+
structures.append(structure)
|
1456 |
+
return structures
|
1457 |
+
|
1458 |
+
|
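A minimal usage sketch for the `predict_structure` method above, for zero-shot RNA secondary structure prediction. The repo id and the availability of a tokenizer via `AutoTokenizer` are assumptions; the method also expects `config.id2label` to map class ids to dot-bracket symbols and `config.verify_ss` to be set in the checkpoint's config.

```python
# Sketch: zero-shot secondary structure prediction with predict_structure() defined above.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

repo = "yangheng/OmniGenome-52M"                      # assumption: repo id from the README
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForTokenClassification.from_pretrained(repo, trust_remote_code=True)

rna = "GGGAAAUCC"
inputs = tokenizer(rna, return_tensors="pt")
with torch.no_grad():
    structures = model.predict_structure(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
    )
print(structures[0])  # dot-bracket string, e.g. "(((...)))" (special-token positions included)
```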
1459 |
+
@add_start_docstrings(
|
1460 |
+
"""
|
1461 |
+
OmniGenome Model with a simple genetic algorithm based RNA design head on top.
|
1462 |
+
""",
|
1463 |
+
OmniGenome_START_DOCSTRING,
|
1464 |
+
)
|
1465 |
+
class OmniGenomeMaskedLMForRNADesign(OmniGenomePreTrainedModel):
|
1466 |
+
def __init__(self, config):
|
1467 |
+
super().__init__(config)
|
1468 |
+
self.num_labels = config.num_labels
|
1469 |
+
self.OmniGenome = OmniGenomeForMaskedLM(config)
|
1470 |
+
self.num_generation = config.num_generation
|
1471 |
+
self.num_population = config.num_population
|
1472 |
+
# self.init_weights()
|
1473 |
|
1474 |
@add_start_docstrings_to_model_forward(
|
1475 |
+
OmniGenome_INPUTS_DOCSTRING.format("batch_size, sequence_length")
|
1476 |
)
|
1477 |
@add_code_sample_docstrings(
|
1478 |
checkpoint=_CHECKPOINT_FOR_DOC,
|
|
|
1480 |
config_class=_CONFIG_FOR_DOC,
|
1481 |
)
|
1482 |
def forward(
|
1483 |
+
self,
|
1484 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1485 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1486 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1487 |
+
head_mask: Optional[torch.Tensor] = None,
|
1488 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
1489 |
+
labels: Optional[torch.LongTensor] = None,
|
1490 |
+
output_attentions: Optional[bool] = None,
|
1491 |
+
output_hidden_states: Optional[bool] = True,
|
1492 |
+
return_dict: Optional[bool] = None,
|
1493 |
) -> Union[Tuple, TokenClassifierOutput]:
|
1494 |
r"""
|
1495 |
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
|
|
1499 |
return_dict if return_dict is not None else self.config.use_return_dict
|
1500 |
)
|
1501 |
|
1502 |
+
outputs = self.OmniGenome(
|
1503 |
input_ids,
|
1504 |
attention_mask=attention_mask,
|
1505 |
position_ids=position_ids,
|
|
|
1534 |
)
|
1535 |
|
1536 |
|
1537 |
+
# Copied from transformers.models.esm.modeling_esm.EsmClassificationHead with Esm->OmniGenome
|
1538 |
+
class OmniGenomeClassificationHead(nn.Module):
|
1539 |
"""Head for sentence-level classification tasks."""
|
1540 |
|
1541 |
def __init__(self, config):
|
|
|
1555 |
|
1556 |
|
1557 |
def create_position_ids_from_input_ids(
|
1558 |
+
input_ids, padding_idx, past_key_values_length=0
|
1559 |
):
|
1560 |
"""
|
1561 |
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
|
|
|
1569 |
# The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
|
1570 |
mask = input_ids.ne(padding_idx).int()
|
1571 |
incremental_indices = (
|
1572 |
+
torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length
|
1573 |
+
) * mask
|
1574 |
return incremental_indices.long() + padding_idx
|
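A small worked example of `create_position_ids_from_input_ids` above: padding positions keep `padding_idx`, and real tokens count up from `padding_idx + 1` (padding id 1, as in config.json).

```python
# Worked example for create_position_ids_from_input_ids shown above.
import torch

input_ids = torch.tensor([[0, 5, 6, 2, 1, 1]])   # toy ids; 1 is the padding id
padding_idx = 1
past_key_values_length = 0

mask = input_ids.ne(padding_idx).int()           # [[1, 1, 1, 1, 0, 0]]
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
position_ids = incremental_indices.long() + padding_idx
print(position_ids)                              # tensor([[2, 3, 4, 5, 1, 1]])
```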