import gradio as gr
import os
import gc
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.regularizers import l2
from keras.constraints import max_norm
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
from keras.layers import Embedding, Bidirectional, LSTM, GlobalMaxPooling1D
import tensorflow as tf
from huggingface_hub import hf_hub_url, cached_download


class Sequence:
    # Map the 20 standard amino acids to integers 1..20; anything else encodes as 0.
    codes = {c: i + 1 for i, c in enumerate(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])}

    @classmethod
    def integer_encoding(cls, data):
        """
        - Encodes a residue sequence as integer values.
        - The 20 common amino acids are considered;
          any other character is categorized as 0.
        """
        return np.array([cls.codes.get(code, 0) for code in data])

    @classmethod
    def prepare(cls, sequence):
        sequence = sequence.strip().upper()
        ie = cls.integer_encoding(sequence)
        max_length = 100
        padded_ie = pad_sequences([ie], maxlen=max_length, padding='post', truncating='post')
        # Dummy row containing all 21 classes (0..20) so to_categorical always
        # produces 21 one-hot columns; only the real sequence ([:1]) is returned.
        all_ohe = np.array(list(range(21)) + [0] * (100 - 21))
        return padded_ie, to_categorical(np.array([padded_ie[0], all_ohe]))[:1]
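
# Illustrative check (hypothetical sequence, not from the training data):
#   padded, one_hot = Sequence.prepare("MKVLAA")
#   padded.shape   # (1, 100)     integer-encoded, zero-padded to length 100
#   one_hot.shape  # (1, 100, 21) one-hot over the 21 integer classes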


def residual_block(data, filters, d_rate):
    """
    data: input tensor
    filters: number of convolution filters
    d_rate: dilation rate
    """
    shortcut = data

    bn1 = BatchNormalization()(data)
    act1 = Activation('relu')(bn1)
    conv1 = Conv1D(filters, 1, dilation_rate=d_rate, padding='same', kernel_regularizer=l2(0.001))(act1)

    # bottleneck convolution
    bn2 = BatchNormalization()(conv1)
    act2 = Activation('relu')(bn2)
    conv2 = Conv1D(filters, 3, padding='same', kernel_regularizer=l2(0.001))(act2)

    # skip connection (conv2 and shortcut must share the same shape)
    x = Add()([conv2, shortcut])

    return x
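
# Sketch of how the block composes (shapes assume the 100x21 one-hot input used below):
#   inp = Input(shape=(100, 21))
#   h = Conv1D(128, 1, padding='same')(inp)   # project to 128 channels first
#   h = residual_block(h, 128, 2)             # (None, 100, 128) in, (None, 100, 128) out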


def get_model():
    # model
    x_input = Input(shape=(100, 21))

    # initial conv
    conv = Conv1D(128, 1, padding='same')(x_input)

    # per-residue representation
    res1 = residual_block(conv, 128, 2)
    res2 = residual_block(res1, 128, 3)

    x = MaxPooling1D(3)(res2)
    x = Dropout(0.5)(x)

    # softmax classifier
    x = Flatten()(x)
    x_output = Dense(1000, activation='softmax', kernel_regularizer=l2(0.0001))(x)

    model2 = Model(inputs=x_input, outputs=x_output)
    model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    weights = cached_download(hf_hub_url("jonathang/Protein_Family_Models", 'model2.h5'))
    model2.load_weights(weights)
    return model2


def get_lstm_model():
    x_input = Input(shape=(100,))
    emb = Embedding(21, 128, input_length=100)(x_input)
    bi_rnn = Bidirectional(LSTM(64, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))(emb)
    # bi_rnn = CuDNNLSTM(64, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01))(emb)
    x = Dropout(0.3)(bi_rnn)

    # softmax classifier
    x_output = Dense(1000, activation='softmax')(x)

    model1 = Model(inputs=x_input, outputs=x_output)
    model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    weights = cached_download(hf_hub_url("jonathang/Protein_Family_Models", 'model1.h5'))
    model1.load_weights(weights)
    return model1


cnn_model = get_model()
lstm_model = get_lstm_model()

mappings_path = cached_download(hf_hub_url("jonathang/Protein_Family_Models", 'prot_mappings.json'))
with open(mappings_path) as f:
    prot_mappings = json.load(f)
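
# prot_mappings is expected to hold two lookup tables (inferred from how it is used below):
#   'id2fam_asc'     : class index (as a string) -> family accession
#   'fam_asc2fam_id' : family accession          -> human-readable family ID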


def greet(Amino_Acid_Sequence):
    padded_seq, processed_seq = Sequence.prepare(Amino_Acid_Sequence)
    cnn_raw_prediction = cnn_model.predict(processed_seq)[0]
    lstm_raw_prediction = lstm_model.predict(padded_seq)[0]
    # Weighted ensemble: 0.7 CNN + 0.3 LSTM.
    joined_prediction = cnn_raw_prediction * 0.7 + lstm_raw_prediction * 0.3

    cnn_idx = cnn_raw_prediction.argmax()
    lstm_idx = lstm_raw_prediction.argmax()
    idx = joined_prediction.argmax()

    cnn_fam_asc = prot_mappings['id2fam_asc'][str(cnn_idx)]
    cnn_fam_id = prot_mappings['fam_asc2fam_id'][cnn_fam_asc]
    lstm_fam_asc = prot_mappings['id2fam_asc'][str(lstm_idx)]
    lstm_fam_id = prot_mappings['fam_asc2fam_id'][lstm_fam_asc]
    fam_asc = prot_mappings['id2fam_asc'][str(idx)]
    fam_id = prot_mappings['fam_asc2fam_id'][fam_asc]

    # Ensemble probabilities keyed by family accession, then re-keyed by family ID.
    # Note: if several accessions share a family ID, later entries overwrite earlier ones.
    idprobs = {prot_mappings['id2fam_asc'][str(i)]: float(joined_prediction[i]) for i in range(len(joined_prediction))}
    famprobs = {prot_mappings['fam_asc2fam_id'][acc]: prob for acc, prob in idprobs.items()}

    gc.collect()
    return famprobs, idprobs, f"""
Input is {Amino_Acid_Sequence}.

Processed input is:
{processed_seq}

CNN prediction: Family Accession={cnn_fam_asc}, ID={cnn_fam_id}
LSTM prediction: Family Accession={lstm_fam_asc}, ID={lstm_fam_id}

The 0.7*CNN + 0.3*LSTM ensemble prediction maps to:
Family Accession={fam_asc}, ID={fam_id}

Raw joined prediction:
{joined_prediction}
"""


iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs=[
        gr.Label(num_top_classes=11, label="Family ID Predictions"),
        gr.Label(num_top_classes=11, label="Family Accession Predictions"),
        "text",
    ],
)
iface.launch()