add python files from official repo
- example-scripts +0 -1
- example_model_advanced.py +296 -0
- utils.py +312 -0
example-scripts
DELETED
@@ -1 +0,0 @@
Subproject commit 838bfd1788feaf40362d6bedb3e4683832a9dbb1
example_model_advanced.py
ADDED
@@ -0,0 +1,296 @@
import pandas as pd
from lightgbm import LGBMRegressor
import gc
from numerapi import NumerAPI
from pathlib import Path
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    get_time_series_cross_val_splits,
    validation_metrics,
    load_model_config,
    save_model_config,
    save_prediction,
    TARGET_COL,
)


EXAMPLE_PREDS_COL = "example_preds"
ERA_COL = "era"
# params we'll use to train all of our models.
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
model_params = {"n_estimators": 2000,
                "learning_rate": 0.01,
                "max_depth": 5,
                "num_leaves": 2 ** 5,
                "colsample_bytree": 0.1}

# the amount of downsampling we'll use to speed up cross validation and full train.
# a value of 1 means no downsampling
# a value of 10 means use every 10th row
downsample_cross_val = 20
downsample_full_train = 2

# if model_selection_loop=True, get OOS performance for training_data
# and use that to select the best model
# if model_selection_loop=False, just predict on tournament data using existing models and model config
model_selection_loop = True
model_config_name = "advanced_example_model"

napi = NumerAPI()

current_round = napi.get_current_round()

Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/features.json")


print("Entering model selection loop. This may take a while.")
if model_selection_loop:
    model_config = {}
    print('reading training_data')
    training_data = pd.read_parquet('v4/train.parquet')

    # keep track of some prediction columns
    ensemble_cols = set()
    pred_cols = set()

    # pick some targets to use
    possible_targets = [c for c in training_data.columns if c.startswith("target_")]
    # randomly pick a handful of targets
    # this can be vastly improved
    targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]

    # all the possible features to train on
    feature_cols = [c for c in training_data if c.startswith("feature_")]

    """ do cross val to get out of sample training preds"""
    cv = 3
    train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)
    # get out of sample training preds via embargoed time series cross validation
    # optionally downsample training data to speed up this section.
    print("entering time series cross validation loop")
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        print(f"doing split {split+1} out of {cv}")
        train_split, test_split = train_test_split
        train_split_index = training_data[ERA_COL].isin(train_split)
        test_split_index = training_data[ERA_COL].isin(test_split)
        downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]

        # getting the per era correlation of each feature vs the primary target across the training split
        print("getting feature correlations over time and identifying riskiest features")
        all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
            lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
        # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
        # there are probably more clever ways to do this
        riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)

        print(f"entering model training loop for split {split+1}")
        for target in targets:
            model_name = f"model_{target}"
            print(f"model: {model_name}")

            # train a model on the training split (and save it for future use)
            split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
            split_model = load_model(split_model_name)
            if not split_model:
                print(f"training model: {model_name}")
                split_model = LGBMRegressor(**model_params)
                split_model.fit(training_data.loc[downsampled_train_split_index, feature_cols],
                                training_data.loc[downsampled_train_split_index,
                                                  [target]])
                save_model(split_model, split_model_name)
            # now we can predict on the test part of the split
            model_expected_features = split_model.booster_.feature_name()
            if set(model_expected_features) != set(feature_cols):
                print(f"New features are available! Might want to retrain model {split_model_name}.")
            print(f"predicting {model_name}")
            training_data.loc[test_split_index, f"preds_{model_name}"] = \
                split_model.predict(training_data.loc[test_split_index, model_expected_features])

            # do neutralization
            print("doing neutralization to riskiest features")
            training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
                df=training_data.loc[test_split_index, :],
                columns=[f"preds_{model_name}"],
                neutralizers=riskiest_features_split,
                proportion=1.0,
                normalize=True,
                era_col=ERA_COL)[f"preds_{model_name}"]

            # remember that we made all of these different pred columns
            pred_cols.add(f"preds_{model_name}")
            pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")

    print("creating ensembles")
    # ranking per era for all of our pred cols so we can combine safely on the same scales
    training_data[list(pred_cols)] = training_data.groupby(ERA_COL).apply(
        lambda d: d[list(pred_cols)].rank(pct=True))
    # do ensembles
    training_data["ensemble_neutral_riskiest_50"] = sum(
        [training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
        pct=True)
    training_data["ensemble_not_neutral"] = sum(
        [training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
    training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

    ensemble_cols.add("ensemble_neutral_riskiest_50")
    ensemble_cols.add("ensemble_not_neutral")
    ensemble_cols.add("ensemble_all")

    """ Now get some stats and pick our favorite model"""
    print("gathering validation metrics for out of sample training results")
    all_model_cols = list(pred_cols) + list(ensemble_cols)
    # use example_col preds_model_target as an estimate since no example preds are provided for training
    # fast_mode=True so that we skip some of the stats that are slower to calculate
    training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
                                        fast_mode=True, target_col=TARGET_COL)
    print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())

    # pick the model that has the highest correlation sharpe
    best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
    print(f"selecting model {best_pred_col} as our highest sharpe model in validation")

    """ Now do a full train"""
    print("entering full training section")
    # getting the per era correlation of each feature vs the target across all of training data
    print("getting feature correlations with target and identifying riskiest features")
    all_feature_corrs = training_data.groupby(ERA_COL).apply(
        lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
    # find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
    riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

    for target in targets:
        gc.collect()
        model_name = f"model_{target}_downsample{downsample_full_train}"
        model = load_model(model_name)
        if not model:
            print(f"training {model_name}")
            model = LGBMRegressor(**model_params)
            # train on all of train, predict on val, predict on tournament
            model.fit(training_data.iloc[::downsample_full_train].loc[:, feature_cols],
                      training_data.iloc[::downsample_full_train][target])
            save_model(model, model_name)
    gc.collect()

    model_config["feature_cols"] = feature_cols
    model_config["targets"] = targets
    model_config["best_pred_col"] = best_pred_col
    model_config["riskiest_features"] = riskiest_features
    print(f"saving model config for {model_config_name}")
    save_model_config(model_config, model_config_name)
else:
    # load model config from previous model selection loop
    print(f"loading model config for {model_config_name}")
    model_config = load_model_config(model_config_name)
    feature_cols = model_config["feature_cols"]
    targets = model_config["targets"]
    best_pred_col = model_config["best_pred_col"]
    riskiest_features = model_config["riskiest_features"]


""" Things that we always do even if we've already trained """
gc.collect()

print("reading tournament_data")
live_data = pd.read_parquet('v4/live.parquet')
print("reading validation_data")
validation_data = pd.read_parquet('v4/validation.parquet')
print("reading example_predictions")
example_preds = pd.read_parquet('v4/live_example_preds.parquet')
print("reading example_validation_predictions")
validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
# set the example predictions
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]

# check for nans and fill nans
print("checking for nans in the tournament data")
if live_data.loc[:, feature_cols].isna().sum().sum():
    cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
    total_rows = len(live_data)
    print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
    print(f"out of {total_rows} total rows")
    print("filling nans with 0.5")
    live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
else:
    print("No nans in the features this week!")


pred_cols = set()
ensemble_cols = set()
for target in targets:
    gc.collect()
    model_name = f"model_{target}_downsample{downsample_full_train}"
    print(f"loading {model_name}")
    model = load_model(model_name)
    if not model:
        raise ValueError(f"{model_name} is not trained yet!")

    model_expected_features = model.booster_.feature_name()
    if set(model_expected_features) != set(feature_cols):
        print(f"New features are available! Might want to retrain model {model_name}.")
    print(f"predicting tournament and validation for {model_name}")
    validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
    live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])

    # do different neutralizations
    # neutralize our predictions to the riskiest features only
    print("neutralizing to riskiest_50 for validation and tournament")
    validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
                                                                            columns=[f"preds_{model_name}"],
                                                                            neutralizers=riskiest_features,
                                                                            proportion=1.0,
                                                                            normalize=True,
                                                                            era_col=ERA_COL)[f"preds_{model_name}"]
    live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
                                                                      columns=[f"preds_{model_name}"],
                                                                      neutralizers=riskiest_features,
                                                                      proportion=1.0,
                                                                      normalize=True,
                                                                      era_col=ERA_COL)[f"preds_{model_name}"]

    pred_cols.add(f"preds_{model_name}")
    pred_cols.add(f"preds_{model_name}_neutral_riskiest_50")


# rank per era for each prediction column so that we can combine safely
validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
# make ensembles for val and tournament
print('creating ensembles for tournament and validation')
validation_data["ensemble_neutral_riskiest_50"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
    pct=True)
live_data["ensemble_neutral_riskiest_50"] = sum(
    [live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
    pct=True)
ensemble_cols.add("ensemble_neutral_riskiest_50")

validation_data["ensemble_not_neutral"] = sum(
    [validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
live_data["ensemble_not_neutral"] = sum(
    [live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
ensemble_cols.add("ensemble_not_neutral")

validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)

ensemble_cols.add("ensemble_all")

gc.collect()
print("getting final validation stats")
# get our final validation stats for our chosen model
validation_stats = validation_metrics(validation_data, list(pred_cols) + list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
                                      fast_mode=False, target_col=TARGET_COL)
print(validation_stats.to_markdown())

# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
save_prediction(live_data["prediction"], f"live_data_{current_round}")
utils.py
ADDED
@@ -0,0 +1,312 @@
import numpy as np
import pandas as pd
import scipy
from halo import Halo
from pathlib import Path
import json
from scipy.stats import skew

ERA_COL = "era"
TARGET_COL = "target_nomi_v4_20"
DATA_TYPE_COL = "data_type"
EXAMPLE_PREDS_COL = "example_preds"

spinner = Halo(text='', spinner='dots')

MODEL_FOLDER = "models"
MODEL_CONFIGS_FOLDER = "model_configs"
PREDICTION_FILES_FOLDER = "prediction_files"


def save_prediction(df, name):
    try:
        Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True)
    except Exception as ex:
        pass
    df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)


def save_model(model, name):
    try:
        Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
    except Exception as ex:
        pass
    pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")


def load_model(name):
    path = Path(f"{MODEL_FOLDER}/{name}.pkl")
    if path.is_file():
        model = pd.read_pickle(f"{MODEL_FOLDER}/{name}.pkl")
    else:
        model = False
    return model


def save_model_config(model_config, model_name):
    try:
        Path(MODEL_CONFIGS_FOLDER).mkdir(exist_ok=True, parents=True)
    except Exception as ex:
        pass
    with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", 'w') as fp:
        json.dump(model_config, fp)


def load_model_config(model_name):
    path_str = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json"
    path = Path(path_str)
    if path.is_file():
        with open(path_str, 'r') as fp:
            model_config = json.load(fp)
    else:
        model_config = False
    return model_config


def get_biggest_change_features(corrs, n):
    all_eras = corrs.index.sort_values()
    h1_eras = all_eras[:len(all_eras) // 2]
    h2_eras = all_eras[len(all_eras) // 2:]

    h1_corr_means = corrs.loc[h1_eras, :].mean()
    h2_corr_means = corrs.loc[h2_eras, :].mean()

    corr_diffs = h2_corr_means - h1_corr_means
    worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
    return worst_n


def get_time_series_cross_val_splits(data, cv=3, embargo=12):
    all_train_eras = data[ERA_COL].unique()
    len_split = len(all_train_eras) // cv
    test_splits = [all_train_eras[i * len_split:(i + 1) * len_split] for i in range(cv)]
    # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv
    remainder = len(all_train_eras) % cv
    if remainder != 0:
        test_splits[-1] = np.append(test_splits[-1], all_train_eras[-remainder:])

    train_splits = []
    for test_split in test_splits:
        test_split_max = int(np.max(test_split))
        test_split_min = int(np.min(test_split))
        # get all of the eras that aren't in the test split
        train_split_not_embargoed = [e for e in all_train_eras if not (test_split_min <= int(e) <= test_split_max)]
        # embargo the train split so we have no leakage.
        # one era is length 5, so we need to embargo by target_length/5 eras.
        # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras.
        train_split = [e for e in train_split_not_embargoed if
                       abs(int(e) - test_split_max) > embargo and abs(int(e) - test_split_min) > embargo]
        train_splits.append(train_split)

    # convenient way to iterate over train and test splits
    train_test_zip = zip(train_splits, test_splits)
    return train_test_zip


def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    if neutralizers is None:
        neutralizers = []
    unique_eras = df[era_col].unique()
    computed = []
    for u in unique_eras:
        df_era = df[df[era_col] == u]
        scores = df_era[columns].values
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values

        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(scores.astype(np.float32)))

        scores /= scores.std(ddof=0)

        computed.append(scores)

    return pd.DataFrame(np.concatenate(computed),
                        columns=columns,
                        index=df.index)


def neutralize_series(series, by, proportion=1.0):
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
    exposures = np.hstack(
        (exposures,
         np.array([np.mean(series)] * len(exposures)).reshape(-1, 1)))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized


def unif(df):
    x = (df.rank(method="first") - 0.5) / len(df)
    return pd.Series(x, index=df.index)


def get_feature_neutral_mean(df, prediction_col, target_col, features_for_neutralization=None):
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
    df.loc[:, "neutral_sub"] = neutralize(df, [prediction_col],
                                          features_for_neutralization)[prediction_col]
    scores = df.groupby("era").apply(
        lambda x: (unif(x["neutral_sub"]).corr(x[target_col]))).mean()
    return np.mean(scores)


def get_feature_neutral_mean_tb_era(df, prediction_col, target_col, tb, features_for_neutralization=None):
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
    temp_df = df.reset_index(drop=True).copy()  # Reset index due to use of argsort later
    temp_df.loc[:, "neutral_sub"] = neutralize(temp_df, [prediction_col],
                                               features_for_neutralization)[prediction_col]
    temp_df_argsort = temp_df.loc[:, 'neutral_sub'].argsort()
    temp_df_tb_idx = pd.concat([temp_df_argsort.iloc[:tb],
                                temp_df_argsort.iloc[-tb:]])
    temp_df_tb = temp_df.loc[temp_df_tb_idx]
    tb_fnc = unif(temp_df_tb['neutral_sub']).corr(temp_df_tb[target_col])
    return tb_fnc


def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
    unique_eras = df[era_col].unique()
    computed = []
    for u in unique_eras:
        df_era = df[df[era_col] == u]
        era_pred = np.float64(df_era[columns].values.T)
        era_target = np.float64(df_era[target].values.T)

        if tb is None:
            ccs = np.corrcoef(era_target, era_pred)[0, 1:]
        else:
            tbidx = np.argsort(era_pred, axis=1)
            tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)
            ccs = [np.corrcoef(era_target[tmpidx], tmppred[tmpidx])[0, 1] for tmpidx, tmppred in zip(tbidx, era_pred)]
            ccs = np.array(ccs)

        computed.append(ccs)

    return pd.DataFrame(np.array(computed), columns=columns, index=df[era_col].unique())


def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None):
    if feature_cols is None:
        feature_cols = [c for c in df.columns if c.startswith("feature")]
    u = df.loc[:, feature_cols].corrwith(df[prediction_col])
    e = df.loc[:, feature_cols].corrwith(df[example_col])
    return 1 - (np.dot(u, e) / np.dot(e, e))


def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False,
                       target_col=TARGET_COL, features_for_neutralization=None):
    validation_stats = pd.DataFrame()
    feature_cols = [c for c in validation_data if c.startswith("feature_")]
    for pred_col in pred_cols:
        # Check the per-era correlations on the validation set (out of sample)
        validation_correlations = validation_data.groupby(ERA_COL).apply(
            lambda d: unif(d[pred_col]).corr(d[target_col]))

        mean = validation_correlations.mean()
        std = validation_correlations.std(ddof=0)
        sharpe = mean / std

        validation_stats.loc["mean", pred_col] = mean
        validation_stats.loc["std", pred_col] = std
        validation_stats.loc["sharpe", pred_col] = sharpe

        rolling_max = (validation_correlations + 1).cumprod().rolling(window=9000,  # arbitrarily large
                                                                      min_periods=1).max()
        daily_value = (validation_correlations + 1).cumprod()
        max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
        validation_stats.loc["max_drawdown", pred_col] = max_drawdown

        payout_scores = validation_correlations.clip(-0.25, 0.25)
        payout_daily_value = (payout_scores + 1).cumprod()

        apy = (
            (
                (payout_daily_value.dropna().iloc[-1])
                ** (1 / len(payout_scores))
            )
            ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
            - 1
        ) * 100

        validation_stats.loc["apy", pred_col] = apy

        if not fast_mode:
            # Check the feature exposure of your validation predictions
            max_per_era = validation_data.groupby(ERA_COL).apply(
                lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max())
            max_feature_exposure = max_per_era.mean()
            validation_stats.loc["max_feature_exposure", pred_col] = max_feature_exposure

            # Check feature neutral mean
            feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col,
                                                            target_col, features_for_neutralization)
            validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean

            # Check TB200 feature neutral mean
            tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply(lambda df: \
                get_feature_neutral_mean_tb_era(df, pred_col,
                                                target_col, 200,
                                                features_for_neutralization))
            validation_stats.loc["tb200_feature_neutral_mean", pred_col] = tb200_feature_neutral_mean_era.mean()

            # Check top and bottom 200 metrics (TB200)
            tb200_validation_correlations = fast_score_by_date(
                validation_data,
                [pred_col],
                target_col,
                tb=200,
                era_col=ERA_COL
            )

            tb200_mean = tb200_validation_correlations.mean()[pred_col]
            tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col]
            tb200_sharpe = tb200_mean / tb200_std

            validation_stats.loc["tb200_mean", pred_col] = tb200_mean
            validation_stats.loc["tb200_std", pred_col] = tb200_std
            validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe

            # MMC over validation
            mmc_scores = []
            corr_scores = []
            for _, x in validation_data.groupby(ERA_COL):
                series = neutralize_series(unif(x[pred_col]), (x[example_col]))
                mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2))
                corr_scores.append(unif(x[pred_col]).corr(x[target_col]))

            val_mmc_mean = np.mean(mmc_scores)
            val_mmc_std = np.std(mmc_scores)
            corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
            corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)

            validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean
            validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe

            # Check correlation with example predictions
            per_era_corrs = validation_data.groupby(ERA_COL).apply(lambda d: unif(d[pred_col]).corr(unif(d[example_col])))
            corr_with_example_preds = per_era_corrs.mean()
            validation_stats.loc["corr_with_example_preds", pred_col] = corr_with_example_preds

            # Check exposure dissimilarity per era
            tdf = validation_data.groupby(ERA_COL).apply(lambda df: \
                exposure_dissimilarity_per_era(df, pred_col,
                                               example_col, feature_cols))
            validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean()

    # .transpose so that stats are columns and the model_name is the row
    return validation_stats.transpose()
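As a quick sanity check of the neutralize helper, the sketch below runs it on randomly generated toy data (hypothetical column names, assuming this utils.py is importable). It builds a prediction that deliberately leans on one feature, then removes that linear exposure era by era; the per-era correlation with the feature should drop to roughly zero after neutralization.

import numpy as np
import pandas as pd
from utils import neutralize  # helper defined above

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "era": ["0001"] * 50 + ["0002"] * 50,
    "feature_toy": rng.normal(size=100),
})
# a prediction that is strongly exposed to the toy feature
toy["preds_toy"] = 0.7 * toy["feature_toy"] + 0.3 * rng.normal(size=100)

# subtract the per-era linear exposure of the prediction to the feature
neutral = neutralize(toy, columns=["preds_toy"], neutralizers=["feature_toy"],
                     proportion=1.0, normalize=True, era_col="era")

# per-era correlation with the feature before and after neutralization
print(toy.groupby("era").apply(lambda d: d["feature_toy"].corr(d["preds_toy"])))
print(toy.groupby("era").apply(lambda d: d["feature_toy"].corr(neutral.loc[d.index, "preds_toy"])))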