Spaces:

jonathang
/

Protein-Family-Ensemble

Runtime error

App Files Files Community

Protein-Family-Ensemble / app.py

jonathang

Ensemble!

fc1e518 over 1 year ago

raw

history blame

5.29 kB

	import gradio as gr
	import os
	import gc
	import json

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt

	from collections import Counter

	from sklearn.preprocessing import LabelEncoder

	from keras.models import Model
	from keras.regularizers import l2
	from keras.constraints import max_norm
	from keras.utils import to_categorical
	from keras.preprocessing.text import Tokenizer
	from keras.utils import pad_sequences
	from keras.callbacks import EarlyStopping
	from keras.layers import Input, Dense, Dropout, Flatten, Activation
	from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
	from keras.layers import Embedding, Bidirectional, LSTM, CuDNNLSTM, GlobalMaxPooling1D

	import tensorflow as tf
	from huggingface_hub import hf_hub_url, cached_download


	class Sequence:
	    """Convert a raw amino-acid string into the arrays the two models consume."""

	    # The 20 standard amino acids are numbered 1..20; 0 is left for padding
	    # and for any character that is not one of these letters.
	    codes = {c: i+1 for i, c in enumerate(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])}

	    # Fixed sequence length: longer inputs are truncated, shorter ones padded.
	    MAX_LENGTH = 100
	    # One class per amino-acid code plus the shared 0 code.
	    NUM_CLASSES = len(codes) + 1

	    @classmethod
	    def integer_encoding(cls, data):
	        """
	        Encode a sequence of residue letters as integer codes.

	        Each of the 20 letters in ``codes`` maps to its 1-based code; any
	        other character maps to 0.

	        Returns a 1-D NumPy array with one entry per character of ``data``.
	        """
	        return np.array([cls.codes.get(code, 0) for code in data])

	    @classmethod
	    def prepare(cls, sequence):
	        """
	        Build both model inputs for a single sequence.

	        The input is stripped of surrounding whitespace and upper-cased,
	        integer-encoded, then padded/truncated at the end to ``MAX_LENGTH``.

	        Returns a tuple ``(padded_ie, one_hot)`` where ``padded_ie`` has
	        shape (1, MAX_LENGTH) and ``one_hot`` has shape
	        (1, MAX_LENGTH, NUM_CLASSES).
	        """
	        sequence = sequence.strip().upper()
	        ie = cls.integer_encoding(sequence)
	        padded_ie = pad_sequences([ie], maxlen=cls.MAX_LENGTH, padding='post', truncating='post')
	        # Passing num_classes pins the one-hot width even when the input does
	        # not contain the highest code, so no dummy row has to be appended.
	        one_hot = to_categorical(padded_ie, num_classes=cls.NUM_CLASSES)
	        return padded_ie, one_hot


	def residual_block(data, filters, d_rate):
	    """
	    Stack two BatchNorm -> ReLU -> Conv1D stages on `data` and add the
	    result back onto `data`.

	    data: input tensor; it is also used as the skip connection, so it is
	        presumably expected to already carry `filters` channels.
	    filters: number of filters for both convolutions
	    d_rate: dilation rate passed to the first convolution
	    """
	    # Stage 1: normalise, activate, then a width-1 convolution that carries
	    # the dilation rate.
	    # NOTE(review): a width-1 kernel has a single tap, so dilation_rate
	    # likely has no effect here; left as-is to keep the architecture.
	    branch = BatchNormalization()(data)
	    branch = Activation('relu')(branch)
	    branch = Conv1D(
	        filters,
	        1,
	        dilation_rate=d_rate,
	        padding='same',
	        kernel_regularizer=l2(0.001),
	    )(branch)

	    # Stage 2: normalise, activate, then a width-3 convolution.
	    branch = BatchNormalization()(branch)
	    branch = Activation('relu')(branch)
	    branch = Conv1D(
	        filters,
	        3,
	        padding='same',
	        kernel_regularizer=l2(0.001),
	    )(branch)

	    # Skip connection: element-wise sum of the transformed branch and the input.
	    return Add()([branch, data])

	def get_model():
	    """
	    Build the convolutional classifier and load its weights.

	    The network takes one-hot input of shape (100, 21), applies a width-1
	    convolution, two residual blocks, max pooling and dropout, and ends in
	    a 1000-way softmax. Weights come from 'model2.h5' in the
	    "jonathang/Protein_Family_Models" hub repository.

	    Returns the compiled model with weights loaded.
	    """
	    inputs = Input(shape=(100, 21))

	    # Initial width-1 convolution lifts the 21 input channels to 128.
	    features = Conv1D(128, 1, padding='same')(inputs)

	    # Two residual blocks, called with dilation rates 2 and then 3.
	    for dilation in (2, 3):
	        features = residual_block(features, 128, dilation)

	    pooled = MaxPooling1D(3)(features)
	    pooled = Dropout(0.5)(pooled)

	    # Softmax classifier over the flattened features.
	    flat = Flatten()(pooled)
	    outputs = Dense(
	        1000,
	        activation='softmax',
	        kernel_regularizer=l2(0.0001),
	    )(flat)

	    cnn = Model(inputs=inputs, outputs=outputs)
	    cnn.compile(
	        optimizer='adam',
	        loss='categorical_crossentropy',
	        metrics=['accuracy'],
	    )

	    # Fetch the weight file from the hub and load it into the network.
	    weights_url = hf_hub_url("jonathang/Protein_Family_Models", 'model2.h5')
	    cnn.load_weights(cached_download(weights_url))

	    return cnn

	def get_lstm_model():
	    """
	    Build the bidirectional-LSTM classifier and load its weights.

	    The network takes integer-encoded input of length 100, embeds each of
	    the 21 codes into 128 dimensions, runs a bidirectional LSTM with 64
	    units per direction, applies dropout and ends in a 1000-way softmax.
	    Weights come from 'model1.h5' in the "jonathang/Protein_Family_Models"
	    hub repository.

	    Returns the compiled model with weights loaded.
	    """
	    inputs = Input(shape=(100,))
	    embedded = Embedding(21, 128, input_length=100)(inputs)

	    # The same L2 penalty (0.01) is applied to kernel, recurrent and bias weights.
	    recurrent = LSTM(
	        64,
	        kernel_regularizer=l2(0.01),
	        recurrent_regularizer=l2(0.01),
	        bias_regularizer=l2(0.01),
	    )
	    encoded = Bidirectional(recurrent)(embedded)
	    encoded = Dropout(0.3)(encoded)

	    # Softmax classifier.
	    outputs = Dense(1000, activation='softmax')(encoded)

	    rnn = Model(inputs=inputs, outputs=outputs)
	    rnn.compile(
	        optimizer='adam',
	        loss='categorical_crossentropy',
	        metrics=['accuracy'],
	    )

	    # Fetch the weight file from the hub and load it into the network.
	    weights_url = hf_hub_url("jonathang/Protein_Family_Models", 'model1.h5')
	    rnn.load_weights(cached_download(weights_url))
	    return rnn


	# Both networks are built and their weights loaded once, at import time,
	# so every request reuses the same model objects.
	cnn_model = get_model()
	lstm_model = get_lstm_model()

	# prot_mappings is read below with the keys 'id2fam_asc' (class index as a
	# string -> family accession) and 'fam_asc2fam_id' (accession -> family ID).
	mappings_path = cached_download(hf_hub_url("jonathang/Protein_Family_Models", 'prot_mappings.json'))
	# Read explicitly as UTF-8 rather than relying on the platform's default
	# locale encoding.
	with open(mappings_path, encoding="utf-8") as f:
	    prot_mappings = json.load(f)

	def _family_for(class_idx):
	    """Return (family accession, family ID) for a predicted class index."""
	    fam_asc = prot_mappings['id2fam_asc'][str(class_idx)]
	    return fam_asc, prot_mappings['fam_asc2fam_id'][fam_asc]


	def greet(Amino_Acid_Sequence):
	    """
	    Classify an amino-acid sequence with the CNN, the LSTM and their
	    weighted ensemble, and return a plain-text report.

	    Amino_Acid_Sequence: the raw sequence string entered by the user.

	    Returns a multi-line string containing the input, the processed
	    one-hot input, the family accession and ID chosen by each model and by
	    the ensemble, and the raw ensemble scores.
	    """
	    padded_seq, processed_seq = Sequence.prepare(Amino_Acid_Sequence)

	    # Each predict call receives a batch of one; [0] takes that single row.
	    cnn_raw_prediction = cnn_model.predict(processed_seq)[0]
	    lstm_raw_prediction = lstm_model.predict(padded_seq)[0]

	    # Weighted ensemble: 70% CNN, 30% LSTM (matches the report text below).
	    joined_prediction = cnn_raw_prediction * 0.7 + lstm_raw_prediction * 0.3

	    cnn_fam_asc, cnn_fam_id = _family_for(cnn_raw_prediction.argmax())
	    lstm_fam_asc, lstm_fam_id = _family_for(lstm_raw_prediction.argmax())
	    fam_asc, fam_id = _family_for(joined_prediction.argmax())

	    gc.collect()
	    return f"""
	Input is {Amino_Acid_Sequence}.
	Processed input is:
	{processed_seq}

	CNN says: Family Accession={cnn_fam_asc} and ID={cnn_fam_id}
	LSTM says: Family Accession={lstm_fam_asc} and ID={lstm_fam_id}

	0.7 * cnn and 0.3 * lstm ensemble model makes prediction which maps to:
	Family Accession={fam_asc} and ID={fam_id}

	Raw Joined Prediction:
	{joined_prediction}
	"""

	# One text input wired to greet(), one text output for its report;
	# launch() then starts the Gradio app.
	iface = gr.Interface(
	    fn=greet,
	    inputs="text",
	    outputs="text",
	)
	iface.launch()