legend1234
committed on
Commit
·
0e95800
1
Parent(s):
d05f89f
Synced repo using 'sync_with_huggingface' Github Action
Browse files
- b3clf/b3clf.py +43 -28
- b3clf/utils.py +14 -9
b3clf/b3clf.py
CHANGED
@@ -31,26 +31,31 @@ import os
|
|
31 |
import numpy as np
|
32 |
from .descriptor_padel import compute_descriptors
|
33 |
from .geometry_opt import geometry_optimize
|
34 |
-
from .utils import (
|
35 |
-
|
|
|
|
|
|
|
|
|
36 |
|
37 |
__all__ = [
|
38 |
"b3clf",
|
39 |
]
|
40 |
|
41 |
|
42 |
-
def b3clf(
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
54 |
"""Use B3clf for BBB classifications with resampling strategies.
|
55 |
|
56 |
Parameters
|
@@ -110,12 +115,13 @@ def b3clf(mol_in,
|
|
110 |
|
111 |
geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
|
112 |
|
113 |
-
_ = compute_descriptors(
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
119 |
|
120 |
# Get computed descriptors
|
121 |
X_features, info_df = get_descriptors(df=features_out)
|
@@ -131,16 +137,25 @@ def b3clf(mol_in,
|
|
131 |
# clf = get_clf(clf_str=clf, sampling_str=sampling)
|
132 |
|
133 |
# Get classifier
|
134 |
-
result_df = predict_permeability(
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
|
|
|
|
139 |
|
140 |
# Get classifier
|
141 |
-
display_cols = [
|
142 |
-
|
143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
if verbose != 0:
|
145 |
print(result_df)
|
146 |
|
|
|
31 |
import numpy as np
|
32 |
from .descriptor_padel import compute_descriptors
|
33 |
from .geometry_opt import geometry_optimize
|
34 |
+
from .utils import (
|
35 |
+
get_descriptors,
|
36 |
+
predict_permeability,
|
37 |
+
scale_descriptors,
|
38 |
+
select_descriptors,
|
39 |
+
)
|
40 |
|
41 |
__all__ = [
|
42 |
"b3clf",
|
43 |
]
|
44 |
|
45 |
|
46 |
+
def b3clf(
|
47 |
+
mol_in,
|
48 |
+
sep="\s+|\t+",
|
49 |
+
clf="xgb",
|
50 |
+
sampling="classic_ADASYN",
|
51 |
+
output="B3clf_output.xlsx",
|
52 |
+
verbose=1,
|
53 |
+
random_seed=42,
|
54 |
+
time_per_mol=-1,
|
55 |
+
keep_features="no",
|
56 |
+
keep_sdf="no",
|
57 |
+
threshold="none",
|
58 |
+
):
|
59 |
"""Use B3clf for BBB classifications with resampling strategies.
|
60 |
|
61 |
Parameters
|
|
|
115 |
|
116 |
geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
|
117 |
|
118 |
+
_ = compute_descriptors(
|
119 |
+
sdf_file=internal_sdf,
|
120 |
+
excel_out=features_out,
|
121 |
+
output_csv=None,
|
122 |
+
timeout=None,
|
123 |
+
time_per_molecule=time_per_mol,
|
124 |
+
)
|
125 |
|
126 |
# Get computed descriptors
|
127 |
X_features, info_df = get_descriptors(df=features_out)
|
|
|
137 |
# clf = get_clf(clf_str=clf, sampling_str=sampling)
|
138 |
|
139 |
# Get classifier
|
140 |
+
result_df = predict_permeability(
|
141 |
+
clf_str=clf,
|
142 |
+
sampling_str=sampling,
|
143 |
+
mol_features=X_features,
|
144 |
+
info_df=info_df,
|
145 |
+
threshold=threshold,
|
146 |
+
)
|
147 |
|
148 |
# Get classifier
|
149 |
+
display_cols = [
|
150 |
+
"ID",
|
151 |
+
"SMILES",
|
152 |
+
"B3clf_predicted_probability",
|
153 |
+
"B3clf_predicted_label",
|
154 |
+
]
|
155 |
+
|
156 |
+
result_df = result_df[
|
157 |
+
[col for col in result_df.columns.to_list() if col in display_cols]
|
158 |
+
]
|
159 |
if verbose != 0:
|
160 |
print(result_df)
|
161 |
|
b3clf/utils.py
CHANGED
@@ -89,9 +89,9 @@ def scale_descriptors(df):
|
|
89 |
dirname = os.path.dirname(__file__)
|
90 |
filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
|
91 |
b3db_scaler = load(filename)
|
92 |
-
|
93 |
|
94 |
-
return
|
95 |
|
96 |
|
97 |
def get_clf(clf_str, sampling_str):
|
@@ -125,7 +125,9 @@ def get_clf(clf_str, sampling_str):
|
|
125 |
return clf
|
126 |
|
127 |
|
128 |
-
def predict_permeability(
|
|
|
|
|
129 |
"""Compute and store BBB predicted label and predicted probability to results dataframe."""
|
130 |
|
131 |
# load the threshold data
|
@@ -133,18 +135,21 @@ def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold=
|
|
133 |
fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
|
134 |
df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
|
135 |
# default threshold is 0.5
|
136 |
-
label_pool = np.zeros(
|
137 |
|
138 |
# get the classifier
|
139 |
clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
|
140 |
|
141 |
-
if
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
145 |
|
146 |
# get predicted probabilities
|
147 |
-
info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(
|
|
|
|
|
148 |
# get predicted label from probability using the threshold
|
149 |
mask = np.greater_equal(
|
150 |
info_df["B3clf_predicted_probability"].to_numpy(),
|
|
|
89 |
dirname = os.path.dirname(__file__)
|
90 |
filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
|
91 |
b3db_scaler = load(filename)
|
92 |
+
df_new = b3db_scaler.transform(df)
|
93 |
|
94 |
+
return df_new
|
95 |
|
96 |
|
97 |
def get_clf(clf_str, sampling_str):
|
|
|
125 |
return clf
|
126 |
|
127 |
|
128 |
+
def predict_permeability(
|
129 |
+
clf_str, sampling_str, mol_features, info_df, threshold="none"
|
130 |
+
):
|
131 |
"""Compute and store BBB predicted label and predicted probability to results dataframe."""
|
132 |
|
133 |
# load the threshold data
|
|
|
135 |
fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
|
136 |
df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
|
137 |
# default threshold is 0.5
|
138 |
+
label_pool = np.zeros(mol_features.shape[0], dtype=int)
|
139 |
|
140 |
# get the classifier
|
141 |
clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
|
142 |
|
143 |
+
if type(mol_features) == pd.DataFrame:
|
144 |
+
if mol_features.index.tolist() != info_df.index.tolist():
|
145 |
+
raise ValueError(
|
146 |
+
"Features_df and Info_df do not have the same index. Internal processing error"
|
147 |
+
)
|
148 |
|
149 |
# get predicted probabilities
|
150 |
+
info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[
|
151 |
+
:, 1
|
152 |
+
]
|
153 |
# get predicted label from probability using the threshold
|
154 |
mask = np.greater_equal(
|
155 |
info_df["B3clf_predicted_probability"].to_numpy(),
|