legend1234 committed on
Commit
0e95800
·
1 Parent(s): d05f89f

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (2) hide show
  1. b3clf/b3clf.py +43 -28
  2. b3clf/utils.py +14 -9
b3clf/b3clf.py CHANGED
@@ -31,26 +31,31 @@ import os
31
  import numpy as np
32
  from .descriptor_padel import compute_descriptors
33
  from .geometry_opt import geometry_optimize
34
- from .utils import (get_descriptors, predict_permeability,
35
- scale_descriptors, select_descriptors)
 
 
 
 
36
 
37
  __all__ = [
38
  "b3clf",
39
  ]
40
 
41
 
42
- def b3clf(mol_in,
43
- sep="\s+|\t+",
44
- clf="xgb",
45
- sampling="classic_ADASYN",
46
- output="B3clf_output.xlsx",
47
- verbose=1,
48
- random_seed=42,
49
- time_per_mol=-1,
50
- keep_features="no",
51
- keep_sdf="no",
52
- threshold="none",
53
- ):
 
54
  """Use B3clf for BBB classifications with resampling strategies.
55
 
56
  Parameters
@@ -110,12 +115,13 @@ def b3clf(mol_in,
110
 
111
  geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
112
 
113
- _ = compute_descriptors(sdf_file=internal_sdf,
114
- excel_out=features_out,
115
- output_csv=None,
116
- timeout=None,
117
- time_per_molecule=time_per_mol,
118
- )
 
119
 
120
  # Get computed descriptors
121
  X_features, info_df = get_descriptors(df=features_out)
@@ -131,16 +137,25 @@ def b3clf(mol_in,
131
  # clf = get_clf(clf_str=clf, sampling_str=sampling)
132
 
133
  # Get classifier
134
- result_df = predict_permeability(clf_str=clf,
135
- sampling_str=sampling,
136
- features_df=X_features,
137
- info_df=info_df,
138
- threshold=threshold)
 
 
139
 
140
  # Get classifier
141
- display_cols = ["ID", "SMILES", "B3clf_predicted_probability", "B3clf_predicted_label"]
142
-
143
- result_df = result_df[[col for col in result_df.columns.to_list() if col in display_cols]]
 
 
 
 
 
 
 
144
  if verbose != 0:
145
  print(result_df)
146
 
 
31
  import numpy as np
32
  from .descriptor_padel import compute_descriptors
33
  from .geometry_opt import geometry_optimize
34
+ from .utils import (
35
+ get_descriptors,
36
+ predict_permeability,
37
+ scale_descriptors,
38
+ select_descriptors,
39
+ )
40
 
41
  __all__ = [
42
  "b3clf",
43
  ]
44
 
45
 
46
+ def b3clf(
47
+ mol_in,
48
+ sep="\s+|\t+",
49
+ clf="xgb",
50
+ sampling="classic_ADASYN",
51
+ output="B3clf_output.xlsx",
52
+ verbose=1,
53
+ random_seed=42,
54
+ time_per_mol=-1,
55
+ keep_features="no",
56
+ keep_sdf="no",
57
+ threshold="none",
58
+ ):
59
  """Use B3clf for BBB classifications with resampling strategies.
60
 
61
  Parameters
 
115
 
116
  geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
117
 
118
+ _ = compute_descriptors(
119
+ sdf_file=internal_sdf,
120
+ excel_out=features_out,
121
+ output_csv=None,
122
+ timeout=None,
123
+ time_per_molecule=time_per_mol,
124
+ )
125
 
126
  # Get computed descriptors
127
  X_features, info_df = get_descriptors(df=features_out)
 
137
  # clf = get_clf(clf_str=clf, sampling_str=sampling)
138
 
139
  # Get classifier
140
+ result_df = predict_permeability(
141
+ clf_str=clf,
142
+ sampling_str=sampling,
143
+ mol_features=X_features,
144
+ info_df=info_df,
145
+ threshold=threshold,
146
+ )
147
 
148
  # Get classifier
149
+ display_cols = [
150
+ "ID",
151
+ "SMILES",
152
+ "B3clf_predicted_probability",
153
+ "B3clf_predicted_label",
154
+ ]
155
+
156
+ result_df = result_df[
157
+ [col for col in result_df.columns.to_list() if col in display_cols]
158
+ ]
159
  if verbose != 0:
160
  print(result_df)
161
 
b3clf/utils.py CHANGED
@@ -89,9 +89,9 @@ def scale_descriptors(df):
89
  dirname = os.path.dirname(__file__)
90
  filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
91
  b3db_scaler = load(filename)
92
- df.iloc[:, :] = b3db_scaler.transform(df)
93
 
94
- return df
95
 
96
 
97
  def get_clf(clf_str, sampling_str):
@@ -125,7 +125,9 @@ def get_clf(clf_str, sampling_str):
125
  return clf
126
 
127
 
128
- def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold="none"):
 
 
129
  """Compute and store BBB predicted label and predicted probability to results dataframe."""
130
 
131
  # load the threshold data
@@ -133,18 +135,21 @@ def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold=
133
  fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
134
  df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
135
  # default threshold is 0.5
136
- label_pool = np.zeros(features_df.shape[0], dtype=int)
137
 
138
  # get the classifier
139
  clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
140
 
141
- if features_df.index.tolist() != info_df.index.tolist():
142
- raise ValueError(
143
- "Features_df and Info_df do not have the same index. Internal processing error"
144
- )
 
145
 
146
  # get predicted probabilities
147
- info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(features_df)[:, 1]
 
 
148
  # get predicted label from probability using the threshold
149
  mask = np.greater_equal(
150
  info_df["B3clf_predicted_probability"].to_numpy(),
 
89
  dirname = os.path.dirname(__file__)
90
  filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
91
  b3db_scaler = load(filename)
92
+ df_new = b3db_scaler.transform(df)
93
 
94
+ return df_new
95
 
96
 
97
  def get_clf(clf_str, sampling_str):
 
125
  return clf
126
 
127
 
128
+ def predict_permeability(
129
+ clf_str, sampling_str, mol_features, info_df, threshold="none"
130
+ ):
131
  """Compute and store BBB predicted label and predicted probability to results dataframe."""
132
 
133
  # load the threshold data
 
135
  fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
136
  df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
137
  # default threshold is 0.5
138
+ label_pool = np.zeros(mol_features.shape[0], dtype=int)
139
 
140
  # get the classifier
141
  clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
142
 
143
+ if type(mol_features) == pd.DataFrame:
144
+ if mol_features.index.tolist() != info_df.index.tolist():
145
+ raise ValueError(
146
+ "Features_df and Info_df do not have the same index. Internal processing error"
147
+ )
148
 
149
  # get predicted probabilities
150
+ info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[
151
+ :, 1
152
+ ]
153
  # get predicted label from probability using the threshold
154
  mask = np.greater_equal(
155
  info_df["B3clf_predicted_probability"].to_numpy(),