Upload main.py
Browse files
@@ -0,0 +1,205 @@
1 |
import streamlit as st
2 |
import pandas as pd
3 |
import numpy as np
4 |
import re
5 |
from PIL import Image
6 |
import webbrowser
7 |
8 |
from rdkit import Chem
9 |
from rdkit.Chem import AllChem
10 |
from rdkit.Chem import Draw
11 |
from rdkit.Chem import rdChemReactions as Reactions
12 |
13 |
import tensorflow as tf
14 |
from tensorflow import keras
15 |
from keras.preprocessing import sequence
16 |
from keras.utils import pad_sequences
17 |
import keras
18 |
from keras import backend as K
19 |
from keras.models import load_model
20 |
import argparse
21 |
import h5py
22 |
import pdb
23 |
24 |
25 |
# One-letter residue codes accepted by the encoder. A residue's position in
# this list determines its integer code.
seq_rdic = [
    'A', 'I', 'L', 'V', 'F', 'W', 'Y', 'N', 'C', 'Q', 'M', 'S', 'T',
    'D', 'E', 'R', 'H', 'K', 'G', 'P', 'O', 'U', 'X', 'B', 'Z',
]
# Residue -> integer code. Codes start at 1, so 0 never denotes a residue
# (encodeSeq returns [0] for a null sequence).
seq_dic = dict(zip(seq_rdic, range(1, len(seq_rdic) + 1)))
28 |
29 |
30 |
31 |
def encodeSeq(seq, seq_dic):
    """Translate a residue string into a list of integer codes.

    Args:
        seq: Sequence to encode; it is iterated character by character.
        seq_dic: Mapping from residue character to integer code.

    Returns:
        ``[0]`` when ``pd.isnull(seq)`` is true; otherwise a list holding
        ``seq_dic[ch]`` for every character ``ch`` of ``seq``.

    Raises:
        KeyError: If ``seq`` contains a character that is not a key of
            ``seq_dic``.
    """
    if not pd.isnull(seq):
        return [seq_dic[residue] for residue in seq]
    # A missing sequence is represented by a single zero code.
    return [0]
36 |
37 |
38 |
39 |
def load_modelfile(model_string):
    """Load a saved Keras model.

    Args:
        model_string: Location of the saved model; passed unchanged to
            ``tf.keras.models.load_model``.

    Returns:
        Whatever ``tf.keras.models.load_model`` returns for that location.
    """
    return tf.keras.models.load_model(model_string)
42 |
43 |
44 |
45 |
def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):
    """Build the padded protein feature row from an ``id:Sequence`` string.

    Args:
        prot_input_str: Text of the form ``"<protein id>:<sequence>"``.
            Only the fields before and after the first ``:`` are used; any
            further ``:``-separated fields are ignored.
        prot_len: Length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A ``(prot_feature, prot_id)`` tuple: the output of
        ``pad_sequences`` for the single encoded sequence, and the protein
        ID with surrounding whitespace removed.

    Raises:
        ValueError: If ``prot_input_str`` contains no ``:`` separator.
        KeyError: If the sequence contains a character unknown to
            ``seq_dic`` (raised by ``encodeSeq``).
    """
    fields = prot_input_str.split(':')
    if len(fields) < 2:
        # Fail with a message the UI can show instead of a bare IndexError.
        raise ValueError(
            "Protein input must look like 'id:Sequence'; got "
            + repr(prot_input_str))

    prot_id = fields[0].strip()
    # Pasted sequences often carry spaces or line breaks; whitespace is not a
    # residue code, so drop it rather than letting the encoder fail on it.
    prot_seq = ''.join(fields[1].split())

    encoded_sequence = encodeSeq(prot_seq, seq_dic)
    # pad_sequences expects a batch, hence the one-element list.
    prot_feature = pad_sequences([encoded_sequence], maxlen=prot_len)

    return prot_feature, prot_id
58 |
59 |
60 |
61 |
def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):
    """Return a one-row compound table and the compound ID for a molecule input.

    Args:
        mol_str: A compound ID when ``kegg_id_flag == 1``; otherwise text of
            the form ``"<compound id>:<SMILES>"``.
        kegg_id_flag: ``1`` looks the compound up in ``kegg_df``; any other
            value computes the fingerprint from the SMILES in ``mol_str``.
        kegg_df: DataFrame with a ``Compound_ID`` column; only consulted
            when ``kegg_id_flag == 1``.

    Returns:
        A ``(compound_df, compound_id)`` tuple. ``compound_df`` is a one-row
        DataFrame indexed by ``Compound_ID``. On the SMILES path its columns
        are ``Smiles`` and ``morgan_fp_r2`` (the tab-joined bits of a
        radius-2, 2048-bit, chirality-aware Morgan fingerprint); on the
        lookup path it carries whatever columns ``kegg_df`` has.

    Raises:
        ValueError: If the compound ID is absent from ``kegg_df``, if
            ``mol_str`` has no ``:`` separator, or if RDKit cannot parse the
            SMILES.
    """
    if kegg_id_flag == 1:
        matching_labels = kegg_df.index[kegg_df.Compound_ID == mol_str]
        if len(matching_labels) == 0:
            raise ValueError(
                'Compound ID ' + repr(mol_str)
                + ' was not found in the KEGG compound table')
        kegg_row = kegg_df.loc[matching_labels[0]]
        kegg_row_df = kegg_row.to_frame().T.set_index('Compound_ID')
        return kegg_row_df, mol_str

    # Split on the first ':' only: ':' is itself a legal SMILES character
    # (atom-map numbers, aromatic bonds), so later colons belong to the SMILES.
    mol_id, separator, mol_smiles = mol_str.partition(':')
    if not separator:
        raise ValueError(
            "Something wrong with molecule input string... expected "
            "'id:SMILES', got " + repr(mol_str))
    mol_id = mol_id.strip()
    mol_smiles = mol_smiles.strip()

    mol = Chem.MolFromSmiles(mol_smiles)
    if mol is None:
        # RDKit signals an unparseable SMILES by returning None, not raising.
        raise ValueError(
            'Something wrong with molecule input string... RDKit could not '
            'parse SMILES ' + repr(mol_smiles))

    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, useChirality=True, radius=2, nBits=2048)
    # Stored as tab-separated float text, the format split again by
    # compound_feature_gen_df_input.
    mol_fp = '\t'.join(str(bit) for bit in np.array(fingerprint).astype(float))

    mol_info_df = pd.DataFrame(
        {'Compound_ID': mol_id, 'Smiles': mol_smiles, 'morgan_fp_r2': mol_fp},
        index=[0],
    ).set_index('Compound_ID')

    return mol_info_df, mol_id
98 |
99 |
100 |
101 |
def act_df_gen_mol_feature(mol_id, prot_id):
    """Pair one protein ID with one compound ID in a single-row DataFrame.

    Args:
        mol_id: Value stored in the ``Compound_ID`` column.
        prot_id: Value stored in the ``Protein_ID`` column.

    Returns:
        A DataFrame with index ``[0]`` and the columns ``Protein_ID`` and
        ``Compound_ID``, in that order.
    """
    pair = {'Protein_ID': prot_id, 'Compound_ID': mol_id}
    return pd.DataFrame(pair, index=[0])
106 |
107 |
108 |
109 |
def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):
    """Turn the fingerprint text of each activity row into a float matrix.

    Args:
        act_df: DataFrame with a ``Compound_ID`` column.
        comp_df: DataFrame indexed by compound ID that holds the
            ``comp_vec`` column of tab-separated numbers.
        comp_len: Accepted for interface compatibility; the body does not
            use it.
        comp_vec: Name of the fingerprint column in ``comp_df``.

    Returns:
        A float NumPy array with one row per merged activity row and one
        column per tab-separated value.
    """
    merged = act_df.merge(comp_df, left_on='Compound_ID', right_index=True)
    bit_rows = [fp_text.split("\t") for fp_text in merged[comp_vec]]
    return np.stack(bit_rows).astype('float')
114 |
115 |
116 |
117 |
def model_prediction(compound_feature, enz_feature, model):
    """Run the model on one compound/enzyme feature pair.

    Args:
        compound_feature: First model input.
        enz_feature: Second model input.
        model: Object exposing ``predict``; it is called with the list
            ``[compound_feature, enz_feature]``.

    Returns:
        Element ``[0][0]`` of the value returned by ``model.predict``.
    """
    scores = model.predict([compound_feature, enz_feature])
    first_row = scores[0]
    return first_row[0]
121 |
122 |
123 |
# loaded_model = load_modelfile('./../CNN_results/model_final.model')
124 |
125 |
# KEGG_compound_read = pd.read_csv('./../CNN_data/Final_test/kegg_compound.csv', index_col = 'Compound_ID')
126 |
# kegg_df = KEGG_compound_read.reset_index()
127 |
128 |
129 |
def main():
    """Render the Streamlit page and score one enzyme-substrate pair on demand.

    Loads the trained model and the KEGG compound table, draws the input
    widgets, and, once "Predict" is pressed, builds the protein and compound
    features and displays the model score. Any failure while building the
    features is reported on the page and no prediction is attempted.
    """
    ld_model = tf.keras.models.load_model('./../CNN_results_split_final/Final_model.model')

    # Compound_ID is read as the index and then moved back to a regular
    # column, which is the layout mol_feature_gen_from_str_input looks up.
    kegg_df = pd.read_csv(
        './../CNN_data/Final_test/kegg_compound.csv',
        index_col='Compound_ID',
    ).reset_index()

    st.image('./header.png', use_column_width=True)

    st.subheader('Enzyme-Substrate Activity Predictor ')

    st.subheader('Enzyme sequence')
    st.caption('Please follow the input format show in the text box--> id:Sequence')
    # NOTE(review): the enzyme input widget is not visible in this view of the
    # file; it is restored here with an empty default -- confirm the intended
    # example value. The explicit key keeps it distinct from the other
    # unlabeled text input below.
    enz_str = st.text_input('', value='', key='enzyme_input')

    st.subheader('Substrate ')
    st.caption('Please follow the input format show in the text box--> KEGG id or click the checkbox')

    comp_str = st.text_input('', value="C00149")
    if st.checkbox('If you are entering smiles string along with KEGG ID'):
        add_info = st.text_area('Additional information (id: Smiles):', "C00149:O[C@@H](CC([O-])=O)C([O-])=O")
    else:
        # Without the checkbox only the KEGG ID lookup is used.
        add_info = ''

    if not st.button("Predict"):
        return

    with st.spinner('Calculating...'):
        try:
            prot_feature, prot_id = prot_feature_gen_from_str_input(enz_str)
            if not add_info:
                kegg_id_flag = 1
                comp_feature, comp_id = mol_feature_gen_from_str_input(comp_str, kegg_id_flag, kegg_df)
            else:
                kegg_id_flag = 0
                comp_feature, comp_id = mol_feature_gen_from_str_input(add_info, kegg_id_flag, kegg_df)

            act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)
            compound_feature = compound_feature_gen_df_input(act_dataframe, comp_feature)
        except Exception as e:
            # UI boundary: show the problem and stop, because the features the
            # prediction needs were never built.
            st.write('Error somewhere...' + repr(e))
            return

        score = model_prediction(compound_feature, prot_feature, ld_model)

    subheaderstring = 'EnzRank Score for ' + prot_id + '-' + comp_id + ' pair:'
    st.subheader(subheaderstring)
    st.write(str(score))
201 |
202 |
203 |
204 |
if __name__ == '__main__':
205 |