Upload main.py
Browse files
main.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import re
|
5 |
+
from PIL import Image
|
6 |
+
import webbrowser
|
7 |
+
|
8 |
+
from rdkit import Chem
|
9 |
+
from rdkit.Chem import AllChem
|
10 |
+
from rdkit.Chem import Draw
|
11 |
+
from rdkit.Chem import rdChemReactions as Reactions
|
12 |
+
|
13 |
+
import tensorflow as tf
|
14 |
+
from tensorflow import keras
|
15 |
+
from keras.preprocessing import sequence
|
16 |
+
from keras.utils import pad_sequences
|
17 |
+
import keras
|
18 |
+
from keras import backend as K
|
19 |
+
from keras.models import load_model
|
20 |
+
import argparse
|
21 |
+
import h5py
|
22 |
+
import pdb
|
23 |
+
|
24 |
+
|
25 |
+
# Residue alphabet used to encode protein sequences. The position of a letter
# in this list (1-based) is its integer code; 0 is never assigned to a residue.
seq_rdic = list('AILVFWYNCQMSTDERHKGPOUXBZ')

# Residue letter -> integer code, numbered from 1 in alphabet order.
seq_dic = {residue: code for code, residue in enumerate(seq_rdic, start=1)}
|
28 |
+
|
29 |
+
|
30 |
+
@st.cache(allow_output_mutation=True)
def encodeSeq(seq, seq_dic):
    """Translate a residue string into a list of integer codes.

    Args:
        seq: Sequence to encode, iterated character by character. A null
            value (as judged by ``pd.isnull``) is accepted.
        seq_dic: Mapping from residue letter to integer code.

    Returns:
        ``[0]`` when ``seq`` is null, otherwise one code per character.

    Raises:
        KeyError: If a character of ``seq`` is not a key of ``seq_dic``.
    """
    if not pd.isnull(seq):
        return [seq_dic[residue] for residue in seq]
    # Null sequence: a single placeholder code.
    return [0]
|
36 |
+
|
37 |
+
|
38 |
+
@st.cache(allow_output_mutation=True)
def load_modelfile(model_string):
    """Load a saved Keras model from ``model_string``.

    The result comes straight from ``tf.keras.models.load_model``; the
    ``st.cache`` decorator lets repeated calls with the same path reuse it.

    Args:
        model_string: Path of the saved model.

    Returns:
        The loaded model object.
    """
    return tf.keras.models.load_model(model_string)
|
42 |
+
|
43 |
+
|
44 |
+
@st.cache(allow_output_mutation=True)
def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):
    """Build the padded protein feature from an ``id:Sequence`` string.

    The text before the first ``:`` is the protein ID and the text between
    the first and second ``:`` (or the end of the string) is the sequence.
    Surrounding whitespace on both parts is ignored.

    Args:
        prot_input_str: User input of the form ``id:Sequence``.
        prot_len: Length the encoded sequence is padded/truncated to.

    Returns:
        Tuple ``(prot_feature, prot_id)`` where ``prot_feature`` is the
        ``pad_sequences`` output for the single encoded sequence (one row,
        ``prot_len`` columns) and ``prot_id`` is the ID string.

    Raises:
        ValueError: If the input contains no ``:`` separator.
        KeyError: If the sequence contains a character missing from
            ``seq_dic`` (raised by ``encodeSeq``).
    """
    parts = prot_input_str.split(':')
    if len(parts) < 2:
        raise ValueError(
            "Enzyme input must have the form 'id:Sequence', got "
            + repr(prot_input_str))

    prot_id = parts[0].strip()
    prot_seq = parts[1].strip()

    # Encode the one sequence directly; a single-element batch gives the
    # same (1, prot_len) array the model input expects.
    encoded = encodeSeq(prot_seq, seq_dic)
    prot_feature = pad_sequences([encoded], maxlen=prot_len)

    return prot_feature, prot_id
|
58 |
+
|
59 |
+
|
60 |
+
@st.cache(allow_output_mutation=True)
def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):
    """Build a one-row compound table from a KEGG ID or an ``id:SMILES`` string.

    Args:
        mol_str: Either a KEGG compound ID (when ``kegg_id_flag == 1``) or a
            string of the form ``id:SMILES``.
        kegg_id_flag: ``1`` to look ``mol_str`` up in ``kegg_df``; any other
            value to parse ``mol_str`` as ``id:SMILES``.
        kegg_df: Compound table with a ``Compound_ID`` column. Only used for
            the KEGG lookup.

    Returns:
        Tuple ``(compound_df, compound_id)``. ``compound_df`` is a one-row
        DataFrame indexed by ``Compound_ID``. For the SMILES path it has the
        columns ``Smiles`` and ``morgan_fp_r2``, the latter being the
        2048-bit radius-2 Morgan fingerprint as a tab-joined string of float
        values. For the KEGG path it is the matching row of ``kegg_df``.

    Raises:
        ValueError: If the KEGG ID is not in ``kegg_df``, or if the
            ``id:SMILES`` string is malformed or cannot be fingerprinted.
    """
    if kegg_id_flag == 1:
        kegg_id = mol_str.strip()
        matches = kegg_df.index[kegg_df.Compound_ID == kegg_id]
        if len(matches) == 0:
            raise ValueError(
                'KEGG compound ID not found in compound table: '
                + repr(kegg_id))
        # Keep only the first match, reshaped to a one-row frame.
        kegg_row = kegg_df.loc[matches[0]]
        return kegg_row.to_frame().T.set_index('Compound_ID'), kegg_id

    try:
        # Split on the FIRST colon only: ':' is also a valid SMILES bond
        # symbol (aromatic bond), so the SMILES part may contain colons.
        mol_id, separator, mol_smiles = mol_str.partition(':')
        if not separator:
            raise ValueError("expected 'id:SMILES', got " + repr(mol_str))
        mol_id = mol_id.strip()
        mol_smiles = mol_smiles.strip()

        mol = Chem.MolFromSmiles(mol_smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            raise ValueError('could not parse SMILES ' + repr(mol_smiles))

        fingerprint = AllChem.GetMorganFingerprintAsBitVect(
            mol, useChirality=True, radius=2, nBits=2048)
        # Serialise the bit vector as tab-separated float strings.
        mol_fp = '\t'.join(map(str, np.array(fingerprint).astype(float)))
    except Exception as error:
        print('Something wrong with molecule input string...' + repr(error))
        # Propagate instead of falling through to a return of unbound names.
        raise ValueError(
            'Invalid molecule input string: ' + repr(mol_str)) from error

    mol_info_df = pd.DataFrame(
        {
            'Compound_ID': mol_id,
            'Smiles': mol_smiles,
            'morgan_fp_r2': mol_fp,
        },
        index=[0],
    ).set_index('Compound_ID')

    return mol_info_df, mol_id
|
98 |
+
|
99 |
+
|
100 |
+
@st.cache(allow_output_mutation=True)
def act_df_gen_mol_feature(mol_id, prot_id):
    """Create the one-row protein/compound pairing table.

    Args:
        mol_id: Compound identifier, stored in the ``Compound_ID`` column.
        prot_id: Protein identifier, stored in the ``Protein_ID`` column.

    Returns:
        A DataFrame with a single row (index ``0``) and the columns
        ``Protein_ID`` and ``Compound_ID``, in that order.
    """
    pair = {'Protein_ID': prot_id, 'Compound_ID': mol_id}
    return pd.DataFrame(pair, index=[0])
|
106 |
+
|
107 |
+
|
108 |
+
@st.cache(allow_output_mutation=True)
def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):
    """Extract the numeric compound fingerprint matrix for the given pairs.

    Args:
        act_df: Pairing table with a ``Compound_ID`` column.
        comp_df: Compound table indexed by compound ID, holding the
            tab-separated fingerprint strings in column ``comp_vec``.
        comp_len: Not used by this function; kept in the signature.
        comp_vec: Name of the fingerprint column in ``comp_df``.

    Returns:
        A float array with one row per merged pair and one column per
        tab-separated fingerprint entry.
    """
    # Attach each pair's compound record by matching Compound_ID to the index.
    merged = act_df.merge(comp_df, left_on='Compound_ID', right_index=True)
    fingerprint_rows = [fp.split("\t") for fp in merged[comp_vec]]
    return np.stack(fingerprint_rows).astype('float')
|
114 |
+
|
115 |
+
|
116 |
+
@st.cache(allow_output_mutation=True)
def model_prediction(compound_feature, enz_feature, model):
    """Run the model on one compound/enzyme pair and return its score.

    Args:
        compound_feature: Compound input passed first to ``model.predict``.
        enz_feature: Enzyme input passed second to ``model.predict``.
        model: Object exposing ``predict``.

    Returns:
        Element ``[0][0]`` of the prediction output.
    """
    scores = model.predict([compound_feature, enz_feature])
    first_row = scores[0]
    return first_row[0]
|
121 |
+
|
122 |
+
|
123 |
+
# loaded_model = load_modelfile('./../CNN_results/model_final.model')
|
124 |
+
|
125 |
+
# KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')
|
126 |
+
# kegg_df = KEGG_compound_read.reset_index()
|
127 |
+
|
128 |
+
|
129 |
+
@st.cache(allow_output_mutation=True)
def _load_kegg_table(csv_path):
    """Read the KEGG compound CSV and return it with ``Compound_ID`` as a column."""
    return pd.read_csv(csv_path, index_col='Compound_ID').reset_index()


def main():
    """Render the Streamlit page and score one enzyme/substrate pair on demand.

    The page collects an ``id:Sequence`` enzyme string and either a KEGG
    compound ID or an ``id:SMILES`` string. When "Predict" is pressed the
    inputs are turned into model features and the first prediction value is
    displayed. If feature generation fails, the error is shown and no
    prediction is attempted.
    """
    # Streamlit re-executes the script on every interaction, so go through
    # the cached loaders instead of re-reading the model and CSV each time.
    ld_model = load_modelfile('./../CNN_results_split_final/Final_model.model')
    kegg_df = _load_kegg_table('./../CNN_data/Final_test/kegg_compound.csv')

    st.image('./header.png', use_column_width=True)

    st.subheader('Enzyme-Substrate Activity Predictor ')

    st.subheader('Enzyme sequence')
    st.caption('Please follow the input format show in the text box--> id:Sequence')

    enz_str = st.text_input('', value="A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN")

    st.subheader('Substrate ')
    st.caption('Please follow the input format show in the text box--> KEGG id or click the checkbox')

    comp_str = st.text_input('', value="C00149")
    if st.checkbox('If you are entering smiles string along with KEGG ID'):
        add_info = st.text_area('Additional information (id: Smiles):', "C00149:O[C@@H](CC([O-])=O)C([O-])=O")
    else:
        add_info = ''

    if not st.button("Predict"):
        return

    with st.spinner('Calculating...'):
        try:
            prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)

            # An empty SMILES box means the substrate is a bare KEGG ID.
            if not add_info:
                kegg_id_flag = 1
                mol_input = comp_str
            else:
                kegg_id_flag = 0
                mol_input = add_info
            comp_feature, comp_id = mol_feature_gen_from_str_input(
                mol_input, kegg_id_flag, kegg_df)

            act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)
            compound_feature = compound_feature_gen_df_input(
                act_dataframe, comp_feature)
        except Exception as e:
            st.write('Error somewhere...' + repr(e))
            # Features are undefined at this point; predicting would only
            # raise a NameError on top of the message already shown.
            return

        y = ld_model.predict([compound_feature, prot_feature])

    subheaderstring = 'EnzRank Score for ' + prot_id + '-' + comp_id + ' pair:'
    st.subheader(subheaderstring)
    st.write(str(y[0][0]))
|
203 |
+
|
204 |
+
# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()
|