sgbaird committed on
Commit
58815da
·
1 Parent(s): b8fce21

Refactor evaluate function in app.py to include parameter scaling and unscaled evaluation

Browse files
Files changed (1) hide show
  1. train_surrogate.py +320 -3
train_surrogate.py CHANGED
@@ -1,3 +1,320 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d70119e59352312f64ab5b620d6f1ccc62616af7ed03ab3efa09ac49b814c019
3
- size 10257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import joblib
3
+ from os import path
4
+ from pathlib import Path
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+
9
+ # from joblib import Parallel, delayed
10
+
11
+ from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
12
+ from sklearn.metrics import mean_squared_error
13
+ from sklearn.model_selection import RandomizedSearchCV
14
+ from sklearn.model_selection import KFold
15
+
16
+ from scipy.stats import uniform, randint
17
+
18
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
model_type = "hgbr"  # "hgbr" or "rfr"
optimize_hyperparameters = True
dummy = False  # if True: keep only the first 100 rows and write into "dummy" subdirs
n_jobs = -1  # Number of jobs to run in parallel. -1 means using all processors.

data_dir = "."
model_dir = "models"

# Raise explicitly instead of using `assert`, which is stripped under `python -O`.
if model_type not in ("hgbr", "rfr"):
    raise ValueError(f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'")

if dummy:
    model_dir = path.join(model_dir, "dummy")

Path(model_dir).mkdir(exist_ok=True, parents=True)

# The CSV is always read from the base data directory; `data_dir` is only
# redirected to the "dummy" subdirectory after the read.
sobol_reg = pd.read_csv(path.join(data_dir, "sobol_regression.csv"))

if dummy:
    data_dir = path.join(data_dir, "dummy")
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of the full table.
    sobol_reg = sobol_reg.head(100).copy()

Path(data_dir).mkdir(exist_ok=True, parents=True)

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------
elemprop_ohe = pd.get_dummies(sobol_reg["elem_prop"], prefix="elem_prop")
# NOTE(review): `hardware_ohe` is computed but never joined into `sobol_reg`
# or listed in `common_features` -- confirm whether hardware should be a feature.
hardware_ohe = pd.get_dummies(sobol_reg["hardware"], prefix="hardware")

# Guarantee the one-hot columns that `common_features` selects exist even when
# the loaded (possibly truncated) sample happens not to contain a category.
for _ohe_column in ("elem_prop_magpie", "elem_prop_mat2vec", "elem_prop_onehot"):
    if _ohe_column not in elemprop_ohe:
        elemprop_ohe[_ohe_column] = False

# Encode the loss criterion as a single boolean flag.
sobol_reg["use_RobustL1"] = sobol_reg["criterion"] == "RobustL1"

sobol_reg["bias"] = sobol_reg["bias"].astype(int)

sobol_reg = pd.concat([sobol_reg, elemprop_ohe], axis=1)

# Input columns shared by every surrogate model.
common_features = [
    "N",
    "alpha",
    "d_model",
    "dim_feedforward",
    "dropout",
    "emb_scaler",
    "eps",
    "epochs_step",
    "fudge",
    "heads",
    "k",
    "lr",
    "pe_resolution",
    "ple_resolution",
    "pos_scaler",
    "weight_decay",
    "batch_size",
    "out_hidden4",
    "betas1",
    "betas2",
    "train_frac",
    "bias",
    "use_RobustL1",
    "elem_prop_magpie",
    "elem_prop_mat2vec",
    "elem_prop_onehot",
]

# ---------------------------------------------------------------------------
# Per-target feature frames, label frames, and output filename stems
# ---------------------------------------------------------------------------
mae_features = common_features + ["mae_rank"]
X_array_mae = sobol_reg[mae_features]
y_array_mae = sobol_reg[["mae"]]
mae_model_stem = path.join(model_dir, "sobol_reg_mae")

rmse_features = common_features + ["rmse_rank"]
X_array_rmse = sobol_reg[rmse_features]
y_array_rmse = sobol_reg[["rmse"]]
rmse_model_stem = path.join(model_dir, "sobol_reg_rmse")

# no model_size_rank because model_size is deterministic via
# `crabnet.utils.utils.count_parameters`
model_size_features = common_features
X_array_model_size = sobol_reg[model_size_features]
y_array_model_size = sobol_reg[["model_size"]]
model_size_model_stem = path.join(model_dir, "sobol_reg_model_size")

runtime_features = common_features + ["runtime_rank"]
X_array_runtime = sobol_reg[runtime_features]
y_array_runtime = sobol_reg[["runtime"]]
runtime_model_stem = path.join(model_dir, "sobol_reg_runtime")
104
+
105
+
106
def _build_estimator():
    """Return an unfitted regressor matching the module-level ``model_type``."""
    if model_type == "rfr":
        return RandomForestRegressor(random_state=13)
    if model_type == "hgbr":
        return HistGradientBoostingRegressor(random_state=13)
    raise ValueError(f"Invalid model type: {model_type}, must be 'hgbr' or 'rfr'")


def _param_distributions(model):
    """Return the randomized-search space for the given estimator."""
    if isinstance(model, HistGradientBoostingRegressor):
        return {
            "max_iter": randint(100, 200),
            "max_leaf_nodes": [None, 30, 50],
            "learning_rate": uniform(0.01, 0.1),
            # Add more hyperparameters here as needed
        }
    return {
        "n_estimators": randint(100, 200),
        # 1.0 (use all features) replaces the former "auto" option, which
        # current scikit-learn releases no longer accept for this estimator.
        "max_features": [1.0, "sqrt"],
        "max_depth": randint(10, 50),
        "min_samples_split": randint(2, 10),
        # Add more hyperparameters here as needed
    }


def _cross_validate(X1, y1, model, n_splits=5):
    """Collect out-of-fold predictions and an RMSE-style score for ``model``.

    A fresh clone of ``model`` is fitted on every fold so that the estimator
    passed in (already fitted on the full data) is left untouched.

    Returns:
        (predictions, score): the concatenated held-out predictions in the
        order the folds were visited, and the square root of the mean of the
        per-fold mean squared errors.
    """
    from sklearn.base import clone

    # NOTE: This doesn't use GroupKFold, which would prevent cross-leakage for
    # the rank column
    splitter = KFold(n_splits=n_splits)
    held_out_preds = []
    fold_mse = []
    for train_index, test_index in splitter.split(X1):
        fold_model = clone(model)
        fold_model.fit(X1.iloc[train_index], y1.iloc[train_index])
        preds = fold_model.predict(X1.iloc[test_index])
        held_out_preds.extend(preds)
        fold_mse.append(mean_squared_error(y1.iloc[test_index], preds))
    return held_out_preds, np.sqrt(np.mean(fold_mse))


def train_and_save(
    sr_feat_array,
    sr_labels_array,
    sr_label_names,
    optimize_hyperparameters=False,
):
    """Fit one surrogate regressor per target and cross-validate each of them.

    Despite the name, nothing is written to disk here; the fitted models are
    returned and persisting them is left to the caller.

    Args:
        sr_feat_array: Iterable of feature tables, one per target. Each must
            expose ``.shape`` and ``.iloc`` (e.g. a pandas DataFrame).
        sr_labels_array: Iterable of label containers aligned with
            ``sr_feat_array``; each is flattened with ``.squeeze()``.
        sr_label_names: Iterable of target names used as dictionary keys.
        optimize_hyperparameters: If True, tune each model with
            ``RandomizedSearchCV`` (10 candidates, 5-fold) and keep the best
            estimator; otherwise fit the estimator with its defaults.

    Returns:
        Tuple ``(models, timings, avg_cv_scores, cv_predictions)`` of dicts
        keyed by target name: the estimator fitted on all rows, the fit/search
        wall-clock time in seconds (cross-validation excluded), the
        cross-validated score, and the out-of-fold predictions.
    """
    models = {}
    timings = {}
    avg_cv_scores = {}
    cv_predictions = {}

    for X1, y1, name1 in zip(sr_feat_array, sr_labels_array, sr_label_names):
        # Flatten a single-column label frame to 1-D for the estimators.
        y1 = y1.squeeze()
        print(f"X1 sr shape: {X1.shape}, Y1 sr shape: {y1.shape}")

        model = _build_estimator()

        start_time = time.time()
        if optimize_hyperparameters:
            # Use RandomizedSearchCV to tune the hyperparameters
            random_search = RandomizedSearchCV(
                model,
                _param_distributions(model),
                n_iter=10,
                cv=5,
                scoring="neg_mean_squared_error",
                random_state=13,
                n_jobs=n_jobs,
            )
            random_search.fit(X1, y1)
            # Use the best estimator found by RandomizedSearchCV
            model = random_search.best_estimator_
        else:
            model.fit(X1, y1)
        timings[name1] = time.time() - start_time

        print(f"Trained {name1} in {timings[name1]} seconds")

        # Manual cross-validation so the held-out predictions can be kept.
        # Folds are fitted on clones, so `model` stays trained on all rows.
        cv_predictions[name1], avg_cv_scores[name1] = _cross_validate(X1, y1, model)

        print(f"Cross-validated score for {name1}: {avg_cv_scores[name1]}")

        models[name1] = model

        print()

    return models, timings, avg_cv_scores, cv_predictions
221
+
222
+
223
# Feature frames, label frames, and target names, listed in matching order
sobol_reg_x_arrays = [X_array_mae, X_array_rmse, X_array_model_size, X_array_runtime]
sobol_reg_labels = [y_array_mae, y_array_rmse, y_array_model_size, y_array_runtime]
sobol_reg_target_names = ["mae", "rmse", "model_size", "runtime"]

# Train one surrogate model per target on all the data
models, timings, avg_cv_scores, cv_predictions = train_and_save(
    sobol_reg_x_arrays,
    sobol_reg_labels,
    sobol_reg_target_names,
    optimize_hyperparameters=optimize_hyperparameters,  # if true, probably ~16 min for iter=5 & cv=3
)

print(f"Timings (in seconds): {timings}")  # doesn't include the cross-validation runtime
print(f"Cross-validated scores: {avg_cv_scores}")

# Tabulate timings and CV scores; scores are looked up by target name so the
# rows stay aligned regardless of the dictionaries' ordering.
results = pd.DataFrame(
    {
        "Model": list(timings),
        "Timing": list(timings.values()),
        "CV Score": [avg_cv_scores[target] for target in timings],
    }
)

# Determine the model type (from the fitted estimators) and optimization status
model_type = (
    "hgbr"
    if isinstance(next(iter(models.values())), HistGradientBoostingRegressor)
    else "rfr"
)
opt_status = "opt" if optimize_hyperparameters else "no_opt"

# Save the results and models with filenames that encode both settings
results_filename = f"model_results_{model_type}_{opt_status}.csv"
models_filename = f"surrogate_models_{model_type}_{opt_status}.pkl"

results.to_csv(path.join(model_dir, results_filename), index=False)
joblib.dump(models, path.join(model_dir, models_filename), compress=7)

# Create a 2x2 grid of subplots, one parity plot per target
fig, axs = plt.subplots(2, 2, figsize=(8, 8))

# Flatten the axs array for easy iteration
axs = axs.flatten()

for ax, name in zip(axs, sobol_reg_target_names):
    # True values from the table vs. the cross-validated predictions
    true_values = sobol_reg[name]
    predicted_values = cv_predictions[name]

    # Create the hexbin plot with log scaling
    hb = ax.hexbin(
        true_values, predicted_values, gridsize=50, cmap="viridis", bins="log"
    )
    cb = plt.colorbar(hb, ax=ax)
    cb.set_label("counts (log scale)")

    # Dashed identity line spanning the range of the true values
    ax.plot(
        [true_values.min(), true_values.max()],
        [true_values.min(), true_values.max()],
        "w--",
    )
    ax.set_xlabel("True Values")
    ax.set_ylabel("Predicted Values")
    ax.set_title(f"Parity Plot for {name}")

    # Set the aspect ratio to be equal
    ax.set_aspect("equal")

# Adjust the layout before saving and showing the figure
plt.tight_layout()

# Save the plot with a filename that encodes model type and optimization status
plot_filename = f"parity_plot_{model_type}_{opt_status}.png"
plt.savefig(path.join(model_dir, plot_filename), dpi=300)

plt.show()
312
+
313
+
314
+ # %% Code Graveyard
315
+
316
+ # # Compute cross-validated score
317
+ # cv_score = cross_val_score(
318
+ # model, X1, y1, cv=5, scoring="neg_mean_squared_error"
319
+ # )
320
+ # cv_scores[name1] = np.sqrt(np.abs(cv_score.mean()))