vuu10 committed on
Commit
0024411
·
1 Parent(s): 49908a6

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +205 -0
main.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ from PIL import Image
6
+ import webbrowser
7
+
8
+ from rdkit import Chem
9
+ from rdkit.Chem import AllChem
10
+ from rdkit.Chem import Draw
11
+ from rdkit.Chem import rdChemReactions as Reactions
12
+
13
+ import tensorflow as tf
14
+ from tensorflow import keras
15
+ from keras.preprocessing import sequence
16
+ from keras.utils import pad_sequences
17
+ import keras
18
+ from keras import backend as K
19
+ from keras.models import load_model
20
+ import argparse
21
+ import h5py
22
+ import pdb
23
+
24
+
25
# One-letter residue codes in the order used for integer encoding.
seq_rdic = list('AILVFWYNCQMSTDERHKGPOUXBZ')

# Residue letter -> 1-based position in seq_rdic; index 0 is never assigned
# to a residue.
seq_dic = dict(zip(seq_rdic, range(1, len(seq_rdic) + 1)))
28
+
29
+
30
@st.cache(allow_output_mutation=True)
def encodeSeq(seq, seq_dic):
    """Translate a residue string into a list of integer codes.

    Args:
        seq: Sequence of one-letter residue codes, or a null value.
        seq_dic: Mapping from residue letter to integer code.

    Returns:
        ``[0]`` when ``seq`` is null according to ``pd.isnull``; otherwise a
        list holding ``seq_dic[letter]`` for every letter of ``seq``.

    Raises:
        KeyError: If a letter of ``seq`` is missing from ``seq_dic``.
    """
    if not pd.isnull(seq):
        return [seq_dic[residue] for residue in seq]
    return [0]
36
+
37
+
38
@st.cache(allow_output_mutation=True)
def load_modelfile(model_string):
    """Load a saved Keras model from ``model_string``.

    The ``st.cache`` decorator memoizes the result per distinct path.

    Args:
        model_string: Path handed to ``tf.keras.models.load_model``.

    Returns:
        The object returned by ``tf.keras.models.load_model``.
    """
    return tf.keras.models.load_model(model_string)
42
+
43
+
44
@st.cache(allow_output_mutation=True)
def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):
    """Build the padded integer feature row for one ``id:Sequence`` string.

    Args:
        prot_input_str: Text of the form ``"<protein id>:<sequence>"``.
            Only the first two colon-separated fields are used.
        prot_len: ``maxlen`` passed to Keras ``pad_sequences``.

    Returns:
        A ``(prot_feature, prot_id)`` tuple: ``prot_feature`` is the
        ``pad_sequences`` output for the single encoded sequence and
        ``prot_id`` is the identifier taken from the input.

    Raises:
        ValueError: If the input has no ``:`` separator.
        KeyError: If the sequence contains a letter missing from ``seq_dic``.
    """
    fields = prot_input_str.split(':')
    if len(fields) < 2:
        # Previously this surfaced as a bare "list index out of range".
        raise ValueError(
            "Enzyme input must look like 'id:Sequence', got "
            + repr(prot_input_str))

    # Text-box input often carries stray spaces or a trailing newline, which
    # would otherwise fail the residue lookup.
    prot_id = fields[0].strip()
    prot_seq = fields[1].strip()

    # A single sequence needs no intermediate DataFrame; pad_sequences
    # accepts a list of encoded sequences directly.
    encoded = encodeSeq(prot_seq, seq_dic)
    prot_feature = pad_sequences([encoded], maxlen=prot_len)

    return prot_feature, prot_id
58
+
59
+
60
@st.cache(allow_output_mutation=True)
def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):
    """Produce a one-row compound table for a KEGG ID or an ``id:SMILES`` string.

    Args:
        mol_str: A KEGG compound ID when ``kegg_id_flag == 1``; otherwise
            text of the form ``"<compound id>:<SMILES>"``.
        kegg_id_flag: ``1`` to look ``mol_str`` up in ``kegg_df``; any other
            value to compute a Morgan fingerprint from the SMILES.
        kegg_df: DataFrame with a ``Compound_ID`` column, searched only in
            the KEGG branch.

    Returns:
        A ``(compound_df, compound_id)`` tuple. ``compound_df`` is a one-row
        DataFrame indexed by ``Compound_ID``; in the SMILES branch it has
        the columns ``Smiles`` and ``morgan_fp_r2`` (tab-joined bit values).

    Raises:
        ValueError: If the KEGG ID is not present in ``kegg_df``, the input
            lacks a ``:`` separator, or RDKit cannot parse the SMILES.
    """
    if kegg_id_flag == 1:
        kegg_id = mol_str.strip()
        matches = kegg_df.index[kegg_df.Compound_ID == kegg_id]
        if len(matches) == 0:
            raise ValueError(
                'KEGG compound ID not found: ' + repr(kegg_id))
        kegg_row = kegg_df.loc[matches[0]]
        return kegg_row.to_frame().T.set_index('Compound_ID'), kegg_id

    # Split on the first ':' only: SMILES may itself contain ':' (aromatic
    # bond symbol, atom-map classes), which a plain split would truncate.
    mol_id, separator, mol_smiles = mol_str.partition(':')
    if not separator:
        raise ValueError(
            "Molecule input must look like 'id:SMILES', got " + repr(mol_str))
    mol_id = mol_id.strip()
    mol_smiles = mol_smiles.strip()

    mol = Chem.MolFromSmiles(mol_smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError('Could not parse SMILES: ' + repr(mol_smiles))

    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, useChirality=True, radius=2, nBits=2048)
    # Serialised as tab-joined float strings, the same form the fingerprint
    # column is split from in compound_feature_gen_df_input.
    bit_values = list(np.array(fingerprint).astype(float))
    mol_fp = '\t'.join(map(str, bit_values))

    mol_info_df = pd.DataFrame(
        {
            'Compound_ID': mol_id,
            'Smiles': mol_smiles,
            'morgan_fp_r2': mol_fp,
        },
        index=[0],
    ).set_index('Compound_ID')

    return mol_info_df, mol_id
98
+
99
+
100
@st.cache(allow_output_mutation=True)
def act_df_gen_mol_feature(mol_id, prot_id):
    """Pair one protein ID with one compound ID in a single-row DataFrame.

    Args:
        mol_id: Value stored in the ``Compound_ID`` column.
        prot_id: Value stored in the ``Protein_ID`` column.

    Returns:
        A DataFrame with index ``[0]`` and the columns ``Protein_ID`` and
        ``Compound_ID``.
    """
    pair = {'Protein_ID': prot_id, 'Compound_ID': mol_id}
    return pd.DataFrame(pair, index=[0])
106
+
107
+
108
@st.cache(allow_output_mutation=True)
def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):
    """Turn tab-separated fingerprint strings into a float feature matrix.

    Args:
        act_df: DataFrame with a ``Compound_ID`` column.
        comp_df: DataFrame indexed by compound ID that holds the
            ``comp_vec`` column.
        comp_len: Accepted for interface compatibility; not used here.
        comp_vec: Name of the column containing tab-joined fingerprint values.

    Returns:
        A float NumPy array with one row per merged record.
    """
    merged = pd.merge(act_df, comp_df, left_on='Compound_ID', right_index=True)
    fingerprint_rows = [fp.split("\t") for fp in merged[comp_vec]]
    return np.stack(fingerprint_rows).astype('float')
114
+
115
+
116
@st.cache(allow_output_mutation=True)
def model_prediction(compound_feature, enz_feature, model):
    """Run ``model.predict`` on one compound/enzyme pair and return one value.

    Args:
        compound_feature: First input passed to ``model.predict``.
        enz_feature: Second input passed to ``model.predict``.
        model: Object exposing ``predict``.

    Returns:
        Element ``[0][0]`` of the prediction output.
    """
    scores = model.predict([compound_feature, enz_feature])
    first_row = scores[0]
    return first_row[0]
121
+
122
+
123
+ # loaded_model = load_modelfile('./../CNN_results/model_final.model')
124
+
125
+ # KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')
126
+ # kegg_df = KEGG_compound_read.reset_index()
127
+
128
+
129
@st.cache(allow_output_mutation=True)
def _load_kegg_table(csv_path):
    """Read the KEGG compound CSV and return it with Compound_ID as a column."""
    return pd.read_csv(csv_path, index_col='Compound_ID').reset_index()


def main():
    """Render the Streamlit page and score one enzyme-substrate pair.

    The page collects an ``id:Sequence`` enzyme string and either a KEGG
    compound ID or an ``id:SMILES`` string. When "Predict" is pressed it
    builds both feature arrays and writes the model's ``[0][0]`` output.
    If feature generation fails, the error is shown and no prediction is
    attempted.
    """
    # Streamlit re-executes the script on every widget interaction, so the
    # model and compound table go through cached loaders instead of being
    # reloaded each time.
    ld_model = load_modelfile('./../CNN_results_split_final/Final_model.model')
    kegg_df = _load_kegg_table('./../CNN_data/Final_test/kegg_compound.csv')

    st.image('./header.png', use_column_width=True)

    st.subheader('Enzyme-Substrate Activity Predictor ')

    st.subheader('Enzyme sequence')
    st.caption('Please follow the input format show in the text box--> id:Sequence')

    enz_str = st.text_input('', value="A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN")

    st.subheader('Substrate ')
    st.caption('Please follow the input format show in the text box--> KEGG id or click the checkbox')

    comp_str = st.text_input('', value="C00149")
    if st.checkbox('If you are entering smiles string along with KEGG ID'):
        add_info = st.text_area('Additional information (id: Smiles):', "C00149:O[C@@H](CC([O-])=O)C([O-])=O")
    else:
        add_info = ''

    if not st.button("Predict"):
        return

    with st.spinner('Calculating...'):
        try:
            prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)
            if add_info:
                # An id:SMILES entry takes precedence over the KEGG ID box.
                comp_feature, comp_id = mol_feature_gen_from_str_input(add_info, 0, kegg_df)
            else:
                comp_feature, comp_id = mol_feature_gen_from_str_input(comp_str, 1, kegg_df)

            act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)
            compound_feature = compound_feature_gen_df_input(act_dataframe, comp_feature)
        except Exception as e:
            st.write('Error somewhere...' + repr(e))
            # The features are unbound at this point; stop rather than let
            # predict() fail with a NameError that hides the real problem.
            return

        y = ld_model.predict([compound_feature, prot_feature])

    subheaderstring = 'EnzRank Score for '+ prot_id + '-' + comp_id + ' pair:'
    st.subheader(subheaderstring)
    st.write(str(y[0][0]))
203
+
204
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    main()