Spaces:
Runtime error
Runtime error
import gc
import json
import os
from collections import Counter

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from huggingface_hub import hf_hub_url, cached_download
from huggingface_hub import hf_hub_download
from keras.callbacks import EarlyStopping
from keras.constraints import max_norm
from keras.layers import Input, Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
from keras.layers import Embedding, Bidirectional, LSTM, CuDNNLSTM, GlobalMaxPooling1D
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import to_categorical
from keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder
class Sequence:
    """Convert a raw amino-acid string into the arrays fed to the two models.

    All methods are classmethods; the class is used as a namespace and is
    never instantiated.
    """

    # Integer code (1..20) for each of the 20 common amino acids. Code 0 is
    # used both for padding and for any symbol not listed here.
    codes = {c: i+1 for i, c in enumerate(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])}

    # Fixed sequence length: shorter inputs are zero-padded, longer ones are
    # truncated at the end.
    max_length = 100

    # One-hot depth: the 20 amino-acid codes plus the shared 0 code.
    num_classes = 21

    @classmethod
    def integer_encoding(cls, data):
        """Map each character of ``data`` to its integer code.

        - The 20 common amino acids are encoded as 1..20.
        - Every other character is encoded as 0.

        Args:
            data: iterable of single-character residue codes (e.g. a str).

        Returns:
            1-D ``np.ndarray`` with one integer per input character.
        """
        return np.array([cls.codes.get(code, 0) for code in data])

    @classmethod
    def prepare(cls, sequence):
        """Build both model inputs for a single sequence.

        The sequence is stripped of surrounding whitespace and upper-cased
        before encoding, so lowercase input is accepted.

        Args:
            sequence: amino-acid sequence as a string.

        Returns:
            Tuple ``(padded_ie, one_hot)`` where ``padded_ie`` has shape
            ``(1, max_length)`` and holds the integer codes, and ``one_hot``
            has shape ``(1, max_length, num_classes)``.
        """
        sequence = sequence.strip().upper()
        ie = cls.integer_encoding(sequence)
        padded_ie = pad_sequences([ie], maxlen=cls.max_length, padding='post', truncating='post')
        # Pin the one-hot depth explicitly; otherwise it would depend on the
        # largest code that happens to occur in this particular sequence.
        return padded_ie, to_categorical(padded_ie, num_classes=cls.num_classes)
def residual_block(data, filters, d_rate):
    """Apply two BatchNorm -> ReLU -> Conv1D stages and add the input back.

    data: input tensor (also used as the skip connection)
    filters: number of filters for both convolutions
    d_rate: dilation rate passed to the first convolution

    Returns the element-wise sum of the second convolution's output and
    ``data``.
    """
    # NOTE(review): the first convolution has kernel size 1, so d_rate
    # presumably has no effect on its output -- confirm before changing,
    # since pretrained weights are loaded into this architecture.
    branch = BatchNormalization()(data)
    branch = Activation('relu')(branch)
    branch = Conv1D(
        filters,
        1,
        dilation_rate=d_rate,
        padding='same',
        kernel_regularizer=l2(0.001),
    )(branch)

    # second stage: width-3 convolution
    branch = BatchNormalization()(branch)
    branch = Activation('relu')(branch)
    branch = Conv1D(
        filters,
        3,
        padding='same',
        kernel_regularizer=l2(0.001),
    )(branch)

    # skip connection: convolved branch + untouched input
    return Add()([branch, data])
def get_model():
    """Build the convolutional classifier and load its pretrained weights.

    The network takes a one-hot input of shape ``(100, 21)``, applies an
    initial width-1 convolution and two residual blocks, then max-pools,
    flattens and classifies into 1000 classes with a softmax layer.

    The weights file ``model2.h5`` is fetched from the
    ``jonathang/Protein_Family_Models`` repository on the Hugging Face Hub
    (network access is needed the first time; later calls hit the local
    cache).

    Returns:
        The compiled Keras ``Model`` with weights loaded.
    """
    # model
    x_input = Input(shape=(100, 21))

    # initial conv
    conv = Conv1D(128, 1, padding='same')(x_input)

    # per-residue representation
    res1 = residual_block(conv, 128, 2)
    res2 = residual_block(res1, 128, 3)

    x = MaxPooling1D(3)(res2)
    x = Dropout(0.5)(x)

    # softmax classifier
    x = Flatten()(x)
    x_output = Dense(1000, activation='softmax', kernel_regularizer=l2(0.0001))(x)

    model2 = Model(inputs=x_input, outputs=x_output)
    model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # hf_hub_download replaces the legacy cached_download(hf_hub_url(...))
    # pair, which newer huggingface_hub releases no longer provide.
    weights = hf_hub_download(repo_id="jonathang/Protein_Family_Models", filename='model2.h5')
    model2.load_weights(weights)
    return model2
def get_lstm_model():
    """Build the bidirectional-LSTM classifier and load its pretrained weights.

    The network takes a length-100 vector of integer codes, embeds each code
    (vocabulary size 21) into 128 dimensions, runs a bidirectional LSTM with
    64 units per direction, applies dropout, and classifies into 1000
    classes with a softmax layer.

    The weights file ``model1.h5`` is fetched from the
    ``jonathang/Protein_Family_Models`` repository on the Hugging Face Hub
    (network access is needed the first time; later calls hit the local
    cache).

    Returns:
        The compiled Keras ``Model`` with weights loaded.
    """
    x_input = Input(shape=(100,))
    emb = Embedding(21, 128, input_length=100)(x_input)
    bi_rnn = Bidirectional(LSTM(64, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))(emb)
    x = Dropout(0.3)(bi_rnn)

    # softmax classifier
    x_output = Dense(1000, activation='softmax')(x)

    model1 = Model(inputs=x_input, outputs=x_output)
    model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # hf_hub_download replaces the legacy cached_download(hf_hub_url(...))
    # pair, which newer huggingface_hub releases no longer provide.
    weights = hf_hub_download(repo_id="jonathang/Protein_Family_Models", filename='model1.h5')
    model1.load_weights(weights)
    return model1
# Build both networks once at import time so every request reuses them.
cnn_model = get_model()
lstm_model = get_lstm_model()

# Lookup tables used to turn a predicted class index into a family
# accession ('id2fam_asc') and an accession into a family ID
# ('fam_asc2fam_id'). Downloaded with hf_hub_download, which replaces the
# legacy cached_download(hf_hub_url(...)) pair.
mappings_path = hf_hub_download(repo_id="jonathang/Protein_Family_Models", filename='prot_mappings.json')
with open(mappings_path, encoding="utf-8") as f:
    prot_mappings = json.load(f)
def greet(Amino_Acid_Sequence):
    """Classify one amino-acid sequence and return a plain-text report.

    The sequence is encoded with ``Sequence.prepare``, scored by the CNN
    (one-hot input) and the LSTM (integer input), and the two score vectors
    are blended as 0.7 * CNN + 0.3 * LSTM. The top class of each of the three
    score vectors is translated to a family accession and family ID through
    ``prot_mappings``.

    Args:
        Amino_Acid_Sequence: the sequence text entered by the user.

    Returns:
        Multi-line string with the input, its processed form, the CNN, LSTM
        and ensemble predictions, and the raw blended score vector.
    """
    padded_seq, processed_seq = Sequence.prepare(Amino_Acid_Sequence)

    cnn_scores = cnn_model.predict(processed_seq)[0]
    lstm_scores = lstm_model.predict(padded_seq)[0]
    joined_prediction = 0.7 * cnn_scores + 0.3 * lstm_scores

    def family_of(scores):
        # Mapping keys are strings, so the winning index is stringified.
        accession = prot_mappings['id2fam_asc'][str(scores.argmax())]
        return accession, prot_mappings['fam_asc2fam_id'][accession]

    cnn_fam_asc, cnn_fam_id = family_of(cnn_scores)
    lstm_fam_asc, lstm_fam_id = family_of(lstm_scores)
    fam_asc, fam_id = family_of(joined_prediction)

    # Explicit collection after each request, as in the original code.
    gc.collect()

    return f"""
Input is {Amino_Acid_Sequence}.
Processed input is:
{processed_seq}
CNN says: Family Accession={cnn_fam_asc} and ID={cnn_fam_id}
LSTM says: Family Accession={lstm_fam_asc} and ID={lstm_fam_id}
0.7 * cnn and 0.3 * lstm ensemble model makes prediction which maps to:
Family Accession={fam_asc} and ID={fam_id}
Raw Joined Prediction:
{joined_prediction}
"""
# Gradio UI: a single text box in, a single text box out, backed by greet().
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)

# Start serving the interface.
iface.launch()