Runtime error
Runtime error
add python files from official repo
Browse files- example-scripts +0 -1
- +296 -0
- +312 -0
@@ -1 +0,0 @@
1 |
Subproject commit 838bfd1788feaf40362d6bedb3e4683832a9dbb1
@@ -0,0 +1,296 @@
1 |
import pandas as pd
2 |
from lightgbm import LGBMRegressor
3 |
import gc
4 |
from numerapi import NumerAPI
5 |
from pathlib import Path
6 |
from utils import (
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
EXAMPLE_PREDS_COL = "example_preds"
21 |
ERA_COL = "era"
22 |
# params we'll use to train all of our models.
23 |
# Ideal params would be more like 20000, 0.001, 6, 2**6, 0.1, but this is slow enough as it is
24 |
model_params = {"n_estimators": 2000,
25 |
"learning_rate": 0.01,
26 |
"max_depth": 5,
27 |
"num_leaves": 2 ** 5,
28 |
"colsample_bytree": 0.1}
29 |
30 |
# the amount of downsampling we'll use to speed up cross validation and full train.
31 |
# a value of 1 means no downsampling
32 |
# a value of 10 means use every 10th row
33 |
downsample_cross_val = 20
34 |
downsample_full_train = 2
35 |
36 |
# if model_selection_loop=True get OOS performance for training_data
37 |
# and use that to select best model
38 |
# if model_selection_loop=False, just predict on tournament data using existing models and model config
39 |
model_selection_loop = True
40 |
model_config_name = "advanced_example_model"
41 |
42 |
napi = NumerAPI()
43 |
44 |
current_round = napi.get_current_round()
45 |
46 |
Path("./v4").mkdir(parents=False, exist_ok=True)
47 |
48 |
49 |
50 |
51 |
print("Entering model selection loop. This may take awhile.")
52 |
if model_selection_loop:
53 |
model_config = {}
54 |
print('reading training_data')
55 |
training_data = pd.read_parquet('v4/train.parquet')
56 |
57 |
# keep track of some prediction columns
58 |
ensemble_cols = set()
59 |
pred_cols = set()
60 |
61 |
# pick some targets to use
62 |
possible_targets = [c for c in training_data.columns if c.startswith("target_")]
63 |
# randomly pick a handful of targets
64 |
# this can be vastly improved
65 |
targets = ["target", "target_nomi_v4_60", "target_jerome_v4_20"]
66 |
67 |
# all the possible features to train on
68 |
feature_cols = [c for c in training_data if c.startswith("feature_")]
69 |
70 |
""" do cross val to get out of sample training preds"""
71 |
cv = 3
72 |
train_test_zip = get_time_series_cross_val_splits(training_data, cv=cv, embargo=12)
73 |
# get out of sample training preds via embargoed time series cross validation
74 |
# optionally downsample training data to speed up this section.
75 |
print("entering time series cross validation loop")
76 |
for split, train_test_split in enumerate(train_test_zip):
77 |
78 |
print(f"doing split {split+1} out of {cv}")
79 |
train_split, test_split = train_test_split
80 |
train_split_index = training_data[ERA_COL].isin(train_split)
81 |
test_split_index = training_data[ERA_COL].isin(test_split)
82 |
downsampled_train_split_index = train_split_index[train_split_index].index[::downsample_cross_val]
83 |
84 |
# getting the per era correlation of each feature vs the primary target across the training split
85 |
print("getting feature correlations over time and identifying riskiest features")
86 |
all_feature_corrs_split = training_data.loc[downsampled_train_split_index, :].groupby(ERA_COL).apply(
87 |
lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
88 |
# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
89 |
# there are probably more clever ways to do this
90 |
riskiest_features_split = get_biggest_change_features(all_feature_corrs_split, 50)
91 |
92 |
print(f"entering model training loop for split {split+1}")
93 |
for target in targets:
94 |
model_name = f"model_{target}"
95 |
print(f"model: {model_name}")
96 |
97 |
# train a model on the training split (and save it for future use)
98 |
split_model_name = f"model_{target}_split{split+1}cv{cv}downsample{downsample_cross_val}"
99 |
split_model = load_model(split_model_name)
100 |
if not split_model:
101 |
print(f"training model: {model_name}")
102 |
split_model = LGBMRegressor(**model_params)
103 |
+[downsampled_train_split_index, feature_cols],
104 |
105 |
106 |
save_model(split_model, split_model_name)
107 |
# now we can predict on the test part of the split
108 |
model_expected_features = split_model.booster_.feature_name()
109 |
if set(model_expected_features) != set(feature_cols):
110 |
print(f"New features are available! Might want to retrain model {split_model_name}.")
111 |
print(f"predicting {model_name}")
112 |
training_data.loc[test_split_index, f"preds_{model_name}"] = \
113 |
split_model.predict(training_data.loc[test_split_index, model_expected_features])
114 |
115 |
# do neutralization
116 |
print("doing neutralization to riskiest features")
117 |
training_data.loc[test_split_index, f"preds_{model_name}_neutral_riskiest_50"] = neutralize(
118 |
df=training_data.loc[test_split_index, :],
119 |
120 |
121 |
122 |
123 |
124 |
125 |
# remember that we made all of these different pred columns
126 |
127 |
128 |
129 |
print("creating ensembles")
130 |
# ranking per era for all of our pred cols so we can combine safely on the same scales
131 |
training_data[list(pred_cols)] = training_data.groupby(ERA_COL).apply(
132 |
lambda d: d[list(pred_cols)].rank(pct=True))
133 |
# do ensembles
134 |
training_data["ensemble_neutral_riskiest_50"] = sum(
135 |
[training_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
136 |
137 |
training_data["ensemble_not_neutral"] = sum(
138 |
[training_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
139 |
training_data["ensemble_all"] = sum([training_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
140 |
141 |
142 |
143 |
144 |
145 |
""" Now get some stats and pick our favorite model"""
146 |
print("gathering validation metrics for out of sample training results")
147 |
all_model_cols = list(pred_cols) + list(ensemble_cols)
148 |
# use example_col preds_model_target as an estimates since no example preds provided for training
149 |
# fast_mode=True so that we skip some of the stats that are slower to calculate
150 |
training_stats = validation_metrics(training_data, all_model_cols, example_col="preds_model_target",
151 |
fast_mode=True, target_col=TARGET_COL)
152 |
print(training_stats[["mean", "sharpe"]].sort_values(by="sharpe", ascending=False).to_markdown())
153 |
154 |
# pick the model that has the highest correlation sharpe
155 |
best_pred_col = training_stats.sort_values(by="sharpe", ascending=False).head(1).index[0]
156 |
print(f"selecting model {best_pred_col} as our highest sharpe model in validation")
157 |
158 |
""" Now do a full train"""
159 |
print("entering full training section")
160 |
# getting the per era correlation of each feature vs the target across all of training data
161 |
print("getting feature correlations with target and identifying riskiest features")
162 |
all_feature_corrs = training_data.groupby(ERA_COL).apply(
163 |
lambda d: d[feature_cols].corrwith(d[TARGET_COL]))
164 |
# find the riskiest features by comparing their correlation vs the target in half 1 and half 2 of training data
165 |
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
166 |
167 |
for target in targets:
168 |
169 |
model_name = f"model_{target}_downsample{downsample_full_train}"
170 |
model = load_model(model_name)
171 |
if not model:
172 |
print(f"training {model_name}")
173 |
model = LGBMRegressor(**model_params)
174 |
# train on all of train, predict on val, predict on tournament
175 |
+[::downsample_full_train].loc[:, feature_cols],
176 |
177 |
save_model(model, model_name)
178 |
179 |
180 |
model_config["feature_cols"] = feature_cols
181 |
model_config["targets"] = targets
182 |
model_config["best_pred_col"] = best_pred_col
183 |
model_config["riskiest_features"] = riskiest_features
184 |
print(f"saving model config for {model_config_name}")
185 |
save_model_config(model_config, model_config_name)
186 |
187 |
# load model config from previous model selection loop
188 |
print(f"loading model config for {model_config_name}")
189 |
model_config = load_model_config(model_config_name)
190 |
feature_cols = model_config["feature_cols"]
191 |
targets = model_config["targets"]
192 |
best_pred_col = model_config["best_pred_col"]
193 |
riskiest_features = model_config["riskiest_features"]
194 |
195 |
196 |
""" Things that we always do even if we've already trained """
197 |
198 |
199 |
print("reading tournament_data")
200 |
live_data = pd.read_parquet('v4/live.parquet')
201 |
print("reading validation_data")
202 |
validation_data = pd.read_parquet('v4/validation.parquet')
203 |
print("reading example_predictions")
204 |
example_preds = pd.read_parquet('v4/live_example_preds.parquet')
205 |
print("reading example_validaton_predictions")
206 |
validation_example_preds = pd.read_parquet('v4/validation_example_preds.parquet')
207 |
# set the example predictions
208 |
validation_data[EXAMPLE_PREDS_COL] = validation_example_preds["prediction"]
209 |
210 |
# check for nans and fill nans
211 |
print("checking for nans in the tournament data")
212 |
if live_data.loc[:, feature_cols].isna().sum().sum():
213 |
cols_w_nan = live_data.loc[:, feature_cols].isna().sum()
214 |
total_rows = len(live_data)
215 |
print(f"Number of nans per column this week: {cols_w_nan[cols_w_nan > 0]}")
216 |
print(f"out of {total_rows} total rows")
217 |
print(f"filling nans with 0.5")
218 |
live_data.loc[:, feature_cols] = live_data.loc[:, feature_cols].fillna(0.5)
219 |
220 |
221 |
print("No nans in the features this week!")
222 |
223 |
224 |
pred_cols = set()
225 |
ensemble_cols = set()
226 |
for target in targets:
227 |
228 |
model_name = f"model_{target}_downsample{downsample_full_train}"
229 |
print(f"loading {model_name}")
230 |
model = load_model(model_name)
231 |
if not model:
232 |
raise ValueError(f"{model_name} is not trained yet!")
233 |
234 |
model_expected_features = model.booster_.feature_name()
235 |
if set(model_expected_features) != set(feature_cols):
236 |
print(f"New features are available! Might want to retrain model {model_name}.")
237 |
print(f"predicting tournament and validation for {model_name}")
238 |
validation_data.loc[:, f"preds_{model_name}"] = model.predict(validation_data.loc[:, model_expected_features])
239 |
live_data.loc[:, f"preds_{model_name}"] = model.predict(live_data.loc[:, model_expected_features])
240 |
241 |
# do different neutralizations
242 |
# neutralize our predictions to the riskiest features only
243 |
print("neutralizing to riskiest_50 for validation and tournament")
244 |
validation_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=validation_data,
245 |
246 |
247 |
248 |
249 |
250 |
live_data[f"preds_{model_name}_neutral_riskiest_50"] = neutralize(df=live_data,
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
# rank per era for each prediction column so that we can combine safely
262 |
validation_data[list(pred_cols)] = validation_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
263 |
live_data[list(pred_cols)] = live_data.groupby(ERA_COL).apply(lambda d: d[list(pred_cols)].rank(pct=True))
264 |
# make ensembles for val and tournament
265 |
print('creating ensembles for tournament and validation')
266 |
validation_data["ensemble_neutral_riskiest_50"] = sum(
267 |
[validation_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
268 |
269 |
live_data["ensemble_neutral_riskiest_50"] = sum(
270 |
[live_data[pred_col] for pred_col in pred_cols if pred_col.endswith("neutral_riskiest_50")]).rank(
271 |
272 |
273 |
274 |
validation_data["ensemble_not_neutral"] = sum(
275 |
[validation_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
276 |
live_data["ensemble_not_neutral"] = sum(
277 |
[live_data[pred_col] for pred_col in pred_cols if "neutral" not in pred_col]).rank(pct=True)
278 |
279 |
280 |
validation_data["ensemble_all"] = sum([validation_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
281 |
live_data["ensemble_all"] = sum([live_data[pred_col] for pred_col in pred_cols]).rank(pct=True)
282 |
283 |
284 |
285 |
286 |
print("getting final validation stats")
287 |
# get our final validation stats for our chosen model
288 |
validation_stats = validation_metrics(validation_data, list(pred_cols)+list(ensemble_cols), example_col=EXAMPLE_PREDS_COL,
289 |
fast_mode=False, target_col=TARGET_COL)
290 |
291 |
292 |
# rename best model to prediction and rank from 0 to 1 to meet diagnostic/submission file requirements
293 |
validation_data["prediction"] = validation_data[best_pred_col].rank(pct=True)
294 |
live_data["prediction"] = live_data[best_pred_col].rank(pct=True)
295 |
save_prediction(validation_data["prediction"], f"validation_predictions_{current_round}")
296 |
save_prediction(live_data["prediction"], f"live_data_{current_round}")
@@ -0,0 +1,312 @@
1 |
import numpy as np
2 |
import pandas as pd
3 |
import scipy
4 |
from halo import Halo
5 |
from pathlib import Path
6 |
import json
7 |
from scipy.stats import skew
8 |
9 |
# Column names shared by the helpers in this module.
ERA_COL = "era"
TARGET_COL = "target_nomi_v4_20"
DATA_TYPE_COL = "data_type"
EXAMPLE_PREDS_COL = "example_preds"

# Module-level terminal spinner instance.
spinner = Halo(text='', spinner='dots')

# Folders (relative to the working directory) that the save_*/load_* helpers
# read from and write to.
MODEL_FOLDER = "models"
MODEL_CONFIGS_FOLDER = "model_configs"
PREDICTION_FILES_FOLDER = "prediction_files"
19 |
20 |
21 |
def save_prediction(df, name):
    """Write ``df`` to ``<PREDICTION_FILES_FOLDER>/<name>.csv``, index included.

    The output folder is created first when it does not exist yet.

    Args:
        df: Object exposing ``to_csv`` (a pandas Series or DataFrame).
        name: Target file name without the ``.csv`` extension.
    """
    # exist_ok/parents already make this a no-op for an existing folder, so
    # no blanket exception guard is needed: any error raised here would make
    # the write below fail as well.
    out_dir = Path(PREDICTION_FILES_FOLDER)
    out_dir.mkdir(exist_ok=True, parents=True)
    df.to_csv(out_dir / f"{name}.csv", index=True)
27 |
28 |
29 |
def save_model(model, name):
    """Pickle ``model`` to ``<MODEL_FOLDER>/<name>.pkl``.

    The model folder is created first when it does not exist yet.

    Args:
        model: Any picklable object.
        name: Target file name without the ``.pkl`` extension.
    """
    # No blanket exception guard: mkdir is already idempotent, and a real
    # failure here would also make the pickle write fail.
    model_dir = Path(MODEL_FOLDER)
    model_dir.mkdir(exist_ok=True, parents=True)
    pd.to_pickle(model, model_dir / f"{name}.pkl")
35 |
36 |
37 |
def load_model(name):
    """Load the object pickled at ``<MODEL_FOLDER>/<name>.pkl``.

    Args:
        name: File name without the ``.pkl`` extension.

    Returns:
        The unpickled object, or ``False`` when the file does not exist.
    """
    path = Path(f"{MODEL_FOLDER}/{name}.pkl")
    if not path.is_file():
        return False
    # NOTE(review): unpickling executes arbitrary code. Only load files that
    # were written by save_model into a trusted folder.
    return pd.read_pickle(path)
44 |
45 |
46 |
def save_model_config(model_config, model_name):
    """Serialize ``model_config`` as JSON to ``<MODEL_CONFIGS_FOLDER>/<model_name>.json``.

    The config folder is created first when it does not exist yet.

    Args:
        model_config: JSON-serializable object.
        model_name: Target file name without the ``.json`` extension.
    """
    # No blanket exception guard: mkdir is already idempotent, and a real
    # failure here would also make the open() below fail.
    config_dir = Path(MODEL_CONFIGS_FOLDER)
    config_dir.mkdir(exist_ok=True, parents=True)
    with open(config_dir / f"{model_name}.json", 'w') as fp:
        json.dump(model_config, fp)
53 |
54 |
55 |
def load_model_config(model_name):
    """Read the JSON config stored at ``<MODEL_CONFIGS_FOLDER>/<model_name>.json``.

    Args:
        model_name: File name without the ``.json`` extension.

    Returns:
        The decoded JSON content, or ``False`` when the file does not exist.
    """
    path = Path(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json")
    if not path.is_file():
        return False
    with open(path, 'r') as fp:
        return json.load(fp)
64 |
65 |
66 |
def get_biggest_change_features(corrs, n):
    """Return the ``n`` columns whose mean changes most between two halves.

    The rows of ``corrs`` are ordered by sorted index and split in the
    middle (the second half gets the extra row when the count is odd). For
    every column the mean over each half is taken, and columns are ranked by
    the absolute difference between the two means.

    Args:
        corrs: DataFrame indexed by era with one column per feature.
        n: Number of column names to return.

    Returns:
        List of column names, largest absolute change first.
    """
    all_eras = corrs.index.sort_values()
    midpoint = len(all_eras) // 2
    h1_eras = all_eras[:midpoint]
    h2_eras = all_eras[midpoint:]

    h1_corr_means = corrs.loc[h1_eras, :].mean()
    h2_corr_means = corrs.loc[h2_eras, :].mean()

    corr_diffs = h2_corr_means - h1_corr_means
    worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
    return worst_n
77 |
78 |
79 |
def get_time_series_cross_val_splits(data, cv=3, embargo=12):
    """Build embargoed train/test era splits for time-series cross validation.

    The unique values of ``data[ERA_COL]`` are cut into ``cv`` consecutive
    test blocks of equal size; leftover eras are appended to the last block.
    For each test block the train split is every remaining era that lies
    more than ``embargo`` eras away from both ends of the block.

    Args:
        data: DataFrame with an ``ERA_COL`` column whose values are
            convertible with ``int()``.
        cv: Number of folds.
        embargo: Number of eras dropped on each side of a test block.

    Returns:
        A ``zip`` iterator of ``(train_eras, test_eras)`` pairs. It is a
        one-shot iterator: wrap it in ``list()`` to traverse it twice.

    Raises:
        ValueError: If there are fewer eras than folds.
    """
    all_train_eras = data[ERA_COL].unique()
    len_split = len(all_train_eras) // cv
    if len_split == 0:
        # Previously surfaced as an opaque "zero-size array" error from np.max.
        raise ValueError(
            f"need at least {cv} eras for cv={cv}, got {len(all_train_eras)}")

    test_splits = [all_train_eras[i * len_split:(i + 1) * len_split] for i in range(cv)]
    # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv
    remainder = len(all_train_eras) % cv
    if remainder != 0:
        test_splits[-1] = np.append(test_splits[-1], all_train_eras[-remainder:])

    train_splits = []
    for test_split in test_splits:
        test_split_max = int(np.max(test_split))
        test_split_min = int(np.min(test_split))
        # get all of the eras that aren't in the test split
        train_split_not_embargoed = [e for e in all_train_eras if not (test_split_min <= int(e) <= test_split_max)]
        # embargo the train split so we have no leakage.
        # one era is length 5, so we need to embargo by target_length/5 eras.
        # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras.
        train_split = [e for e in train_split_not_embargoed if
                       abs(int(e) - test_split_max) > embargo and abs(int(e) - test_split_min) > embargo]
        train_splits.append(train_split)

    # convenient way to iterate over train and test splits
    train_test_zip = zip(train_splits, test_splits)
    return train_test_zip
104 |
105 |
106 |
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Neutralize prediction columns against other columns, era by era.

    Within each era the scores are (optionally) rank-gaussianized, the
    least-squares projection onto the neutralizer columns is subtracted
    (scaled by ``proportion``), and the result is divided by its standard
    deviation.

    Args:
        df: DataFrame holding ``columns``, ``neutralizers`` and ``era_col``.
        columns: List of column names to neutralize.
        neutralizers: List of column names to neutralize against; ``None``
            means no neutralizers.
        proportion: Fraction of the projection to remove (1.0 = all of it).
        normalize: When true, replace each column by the normal quantiles of
            its ordinal ranks before neutralizing.
        era_col: Name of the grouping column.

    Returns:
        DataFrame with ``columns`` as columns and the same index and row
        order as ``df``.
    """
    if neutralizers is None:
        neutralizers = []
    era_values = df[era_col].to_numpy()
    # Rows are written back by position, so the output lines up with
    # df.index even when the rows of one era are not contiguous in df.
    result = np.empty((len(df), len(columns)), dtype=np.float64)
    for era in df[era_col].unique():
        in_era = era_values == era
        df_era = df[in_era]
        scores = df_era[columns].values
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values

        # Out-of-place arithmetic so integer-typed inputs (normalize=False)
        # do not fail on an in-place float update.
        scores = scores - proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(scores.astype(np.float32)))

        scores = scores / scores.std(ddof=0)

        result[in_era] = scores

    return pd.DataFrame(result,
                        columns=columns,
                        index=df.index)
138 |
139 |
140 |
def neutralize_series(series, by, proportion=1.0):
    """Remove from ``series`` its least-squares projection onto ``by``.

    A constant column is added to the regressors so the result is also
    centered, which makes its correlation with ``by`` zero when
    ``proportion`` is 1.

    Args:
        series: pandas Series of scores.
        by: pandas Series of the same length to neutralize against.
        proportion: Fraction of the projection to subtract.

    Returns:
        pandas Series with the index of ``series``.
    """
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # Regress on [by, 1]. A column of ones is used rather than a column of
    # mean(series): both span the same space, but the latter collapses to
    # all zeros (and stops centering) when the series mean is exactly 0.
    exposures = np.hstack(
        (exposures,
         np.ones((len(exposures), 1))))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized
154 |
155 |
156 |
def unif(df):
    """Map values to evenly spaced ranks in (0, 1).

    Ranks are assigned with ``method="first"`` (ties broken by order of
    appearance), shifted by 0.5 and divided by the number of elements.

    Args:
        df: pandas Series (or any object with ``rank`` and ``index``).

    Returns:
        pandas Series with the same index as ``df``.
    """
    x = (df.rank(method="first") - 0.5) / len(df)
    return pd.Series(x, index=df.index)
159 |
160 |
161 |
def get_feature_neutral_mean(df, prediction_col, target_col, features_for_neutralization=None):
    """Mean per-era correlation of feature-neutralized predictions with the target.

    Note: this writes a ``"neutral_sub"`` column into ``df`` as a side effect.

    Args:
        df: DataFrame with an ``"era"`` column, the prediction and target
            columns, and the neutralization features.
        prediction_col: Name of the prediction column.
        target_col: Name of the target column.
        features_for_neutralization: Columns to neutralize against; defaults
            to every column whose name starts with ``"feature"``.

    Returns:
        The mean over eras of the correlation between the uniform-ranked
        neutralized prediction and the target.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
    df.loc[:, "neutral_sub"] = neutralize(df, [prediction_col],
                                          features_for_neutralization)[prediction_col]
    scores = df.groupby("era").apply(
        lambda x: (unif(x["neutral_sub"]).corr(x[target_col]))).mean()
    return np.mean(scores)
169 |
170 |
def get_feature_neutral_mean_tb_era(df, prediction_col, target_col, tb, features_for_neutralization=None):
    """Feature-neutral correlation restricted to the top and bottom ``tb`` rows.

    Predictions are neutralized against the given features, the ``tb``
    lowest and ``tb`` highest neutralized values are kept, and the
    correlation of their uniform ranks with the target is returned. The
    caller's ``df`` is not modified (a re-indexed copy is used).

    Args:
        df: DataFrame with an ``"era"`` column (required by ``neutralize``).
            NOTE(review): intended to be called with a single era's rows -
            confirm against callers.
        prediction_col: Name of the prediction column.
        target_col: Name of the target column.
        tb: Number of rows to keep from each end of the ranking.
        features_for_neutralization: Columns to neutralize against; defaults
            to every column whose name starts with ``"feature"``.

    Returns:
        The correlation as a float.
    """
    if features_for_neutralization is None:
        features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
    temp_df = df.reset_index(drop=True).copy()  # Reset index due to use of argsort later
    temp_df.loc[:, "neutral_sub"] = neutralize(temp_df, [prediction_col],
                                               features_for_neutralization)[prediction_col]
    temp_df_argsort = temp_df.loc[:, 'neutral_sub'].argsort()
    temp_df_tb_idx = pd.concat([temp_df_argsort.iloc[:tb],
                                temp_df_argsort.iloc[-tb:]])
    temp_df_tb = temp_df.loc[temp_df_tb_idx]
    tb_fnc = unif(temp_df_tb['neutral_sub']).corr(temp_df_tb[target_col])
    return tb_fnc
182 |
183 |
184 |
def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
    """Per-era Pearson correlation of each prediction column with the target.

    Args:
        df: DataFrame containing ``columns``, ``target`` and ``era_col``.
        columns: List of prediction column names.
        target: Name of the target column.
        tb: When given, each column is scored only on its ``tb`` lowest and
            ``tb`` highest predictions within the era; ``None`` uses all rows.
        era_col: Name of the grouping column.

    Returns:
        DataFrame indexed by era (in order of first appearance) with one
        column per entry of ``columns``.
    """
    unique_eras = df[era_col].unique()
    computed = []
    for u in unique_eras:
        df_era = df[df[era_col] == u]
        era_pred = np.asarray(df_era[columns].values.T, dtype=np.float64)
        era_target = np.asarray(df_era[target].values.T, dtype=np.float64)

        if tb is None:
            # Row 0 of the matrix is the target; the rest are the columns.
            ccs = np.corrcoef(era_target, era_pred)[0, 1:]
        else:
            tbidx = np.argsort(era_pred, axis=1)
            tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)
            ccs = [np.corrcoef(era_target[tmpidx], tmppred[tmpidx])[0, 1] for tmpidx, tmppred in zip(tbidx, era_pred)]
            ccs = np.array(ccs)

        computed.append(ccs)

    return pd.DataFrame(np.array(computed), columns=columns, index=unique_eras)
203 |
204 |
def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None):
    """Dissimilarity between the feature exposures of two prediction columns.

    With ``u`` the vector of feature correlations of ``prediction_col`` and
    ``e`` that of ``example_col``, returns ``1 - (u . e) / (e . e)``: 0 when
    the exposures are identical, larger as they diverge.

    Args:
        df: DataFrame holding the features and both prediction columns.
        prediction_col: Name of the column being evaluated.
        example_col: Name of the reference prediction column.
        feature_cols: Feature column names; defaults to every column whose
            name starts with ``"feature"``.

    Returns:
        The dissimilarity as a float.
    """
    if feature_cols is None:
        feature_cols = [c for c in df.columns if c.startswith("feature")]
    u = df.loc[:, feature_cols].corrwith(df[prediction_col])
    e = df.loc[:, feature_cols].corrwith(df[example_col])
    return (1 - (np.dot(u, e) / np.dot(e, e)))
210 |
211 |
def validation_metrics(validation_data, pred_cols, example_col, fast_mode=False,
                       target_col=TARGET_COL, features_for_neutralization=None):
    """Compute per-model validation statistics from per-era correlations.

    For every column in ``pred_cols`` this always records ``mean``, ``std``,
    ``sharpe``, ``max_drawdown`` and ``apy``. Unless ``fast_mode`` is set it
    also records feature exposure, feature-neutral mean (all rows and
    top/bottom 200), TB200 correlation stats, MMC stats against
    ``example_col``, correlation with ``example_col`` and exposure
    dissimilarity.

    Args:
        validation_data: DataFrame with ``ERA_COL``, the target, the
            prediction columns and ``feature_*`` columns.
        pred_cols: Iterable of prediction column names to evaluate.
        example_col: Reference prediction column (only read when
            ``fast_mode`` is false).
        fast_mode: Skip the slower statistics when true.
        target_col: Name of the target column.
        features_for_neutralization: Passed through to the feature-neutral
            helpers; ``None`` lets them default to all feature columns.

    Returns:
        DataFrame with one row per prediction column and one column per
        statistic.
    """
    validation_stats = pd.DataFrame()
    feature_cols = [c for c in validation_data if c.startswith("feature_")]
    for pred_col in pred_cols:
        # Check the per-era correlations on the validation set (out of sample)
        validation_correlations = validation_data.groupby(ERA_COL).apply(
            lambda d: unif(d[pred_col]).corr(d[target_col]))

        mean = validation_correlations.mean()
        std = validation_correlations.std(ddof=0)
        sharpe = mean / std

        validation_stats.loc["mean", pred_col] = mean
        validation_stats.loc["std", pred_col] = std
        validation_stats.loc["sharpe", pred_col] = sharpe

        # NOTE(review): the tail of this call was missing from the source;
        # min_periods=1 turns the rolling max into a running max - confirm.
        rolling_max = (validation_correlations + 1).cumprod().rolling(window=9000,  # arbitrarily large
                                                                      min_periods=1).max()
        daily_value = (validation_correlations + 1).cumprod()
        max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
        validation_stats.loc["max_drawdown", pred_col] = max_drawdown

        payout_scores = validation_correlations.clip(-0.25, 0.25)
        payout_daily_value = (payout_scores + 1).cumprod()

        # NOTE(review): the base of this expression was missing from the
        # source; the final compounded payout value is assumed - confirm.
        apy = (
            (
                (payout_daily_value.dropna().iloc[-1])
                ** (1 / len(payout_scores))
            )
            ** 49  # 52 weeks of compounding minus 3 for stake compounding lag
            - 1
        ) * 100

        validation_stats.loc["apy", pred_col] = apy

        if not fast_mode:
            # Check the feature exposure of your validation predictions
            max_per_era = validation_data.groupby(ERA_COL).apply(
                lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max())
            max_feature_exposure = max_per_era.mean()
            validation_stats.loc["max_feature_exposure", pred_col] = max_feature_exposure

            # Check feature neutral mean
            feature_neutral_mean = get_feature_neutral_mean(validation_data, pred_col,
                                                            target_col, features_for_neutralization)
            validation_stats.loc["feature_neutral_mean", pred_col] = feature_neutral_mean

            # Check TB200 feature neutral mean
            # NOTE(review): the last argument was missing from the source - confirm.
            tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply(
                lambda df: get_feature_neutral_mean_tb_era(df, pred_col,
                                                           target_col, 200,
                                                           features_for_neutralization))
            validation_stats.loc["tb200_feature_neutral_mean", pred_col] = tb200_feature_neutral_mean_era.mean()

            # Check top and bottom 200 metrics (TB200)
            # NOTE(review): the argument list was missing from the source - confirm.
            tb200_validation_correlations = fast_score_by_date(
                validation_data,
                [pred_col],
                target_col,
                tb=200,
                era_col=ERA_COL
            )

            tb200_mean = tb200_validation_correlations.mean()[pred_col]
            tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col]
            tb200_sharpe = tb200_mean / tb200_std

            validation_stats.loc["tb200_mean", pred_col] = tb200_mean
            validation_stats.loc["tb200_std", pred_col] = tb200_std
            validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe

            # MMC over validation
            mmc_scores = []
            corr_scores = []
            for _, x in validation_data.groupby(ERA_COL):
                series = neutralize_series(unif(x[pred_col]), (x[example_col]))
                mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29 ** 2))
                # NOTE(review): this append was missing from the source; it
                # is needed for corr_plus_mmcs below to be non-empty - confirm.
                corr_scores.append(unif(x[pred_col]).corr(x[target_col]))

            val_mmc_mean = np.mean(mmc_scores)
            val_mmc_std = np.std(mmc_scores)
            corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
            corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)

            validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean
            validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe

            # Check correlation with example predictions
            per_era_corrs = validation_data.groupby(ERA_COL).apply(lambda d: unif(d[pred_col]).corr(unif(d[example_col])))
            corr_with_example_preds = per_era_corrs.mean()
            validation_stats.loc["corr_with_example_preds", pred_col] = corr_with_example_preds

            # Check exposure dissimilarity per era
            tdf = validation_data.groupby(ERA_COL).apply(
                lambda df: exposure_dissimilarity_per_era(df, pred_col,
                                                          example_col, feature_cols))
            validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean()

    # .transpose so that stats are columns and the model_name is the row
    return validation_stats.transpose()