hellno-o yifanxie commited on
Commit
69c363f
·
0 Parent(s):

Duplicate from yifanxie/numerdash

Browse files

Co-authored-by: Yifan Xie <[email protected]>

.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [theme]
2
+ base = "dark"
README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: NumerDash
3
+ emoji: 🚀🔥
4
+ sdk: streamlit
5
+ app_file: numerdash_app.py
6
+ pinned: false
7
+ duplicated_from: yifanxie/numerdash
8
+ ---
default_models.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "benchmark": [
3
+ "integration_test",
4
+ "integration_test_7"
5
+ ]
6
+ }
numerdash_app.py ADDED
@@ -0,0 +1,975 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import os
5
+ import sys
6
+ sys.path.append(os.path.dirname(os.getcwd()))
7
+ from project_tools import project_utils, project_config, numerapi_utils
8
+ import warnings
9
+ import plotly.express as px
10
+ import json
11
+ warnings.filterwarnings("ignore")
12
+ from PIL import Image
13
+ import plotly.express as px
14
+ import plotly.graph_objects as go
15
+ from plotly.subplots import make_subplots
16
+ from streamlit import caching
17
+ import time
18
+ import traceback
19
+ import datetime
20
+
21
+ st.set_page_config(layout='wide')
22
+ get_benchmark_data = True
23
+
24
+ # get_dailyscore = True
25
+
26
+
27
+
28
+
29
def sidebar_data_picker():
    """Render the 'Model Data Picker' checkboxes in the sidebar.

    Returns:
        A 4-tuple ``(top_lb, top_tp3m, top_tp1y, special_list)`` with the
        value returned by each checkbox, in display order.
    """
    st.sidebar.subheader('Model Data Picker')
    labels = (
        'top LB by corr',
        'most profitable 3 month',
        'most profitable 1 year',
        'model from specific users',
    )
    # Every option starts ticked; the generator is consumed in label order,
    # so the widgets are created in the same order as the labels above.
    top_lb, top_tp3m, top_tp1y, special_list = (
        st.sidebar.checkbox(label, value=True) for label in labels
    )
    return top_lb, top_tp3m, top_tp1y, special_list
36
+
37
+
38
+ # to be removed
39
def model_data_picker_bak(values=None):
    """Legacy sidebar picker that maps ticked checkboxes to model groups.

    Args:
        values: optional sequence of six default checkbox states. Only
            positions 2-5 are read; positions 0 and 1 belonged to the
            'top LB by corr' / 'most profitable 3 month' options, which are
            disabled. Defaults to all ``True``.

    Returns:
        dict mapping a group key ('top_1y', 'benchmark', 'iaai', 'yx', 'mcv')
        to the model list taken from ``project_config``.
    """
    defaults = [True, True, True, True, True, True] if values is None else values

    st.sidebar.subheader('Model Data Picker')
    use_top_1y = st.sidebar.checkbox('most profitable 1 year', value=defaults[2])
    use_special = st.sidebar.checkbox('model from specific users', value=defaults[3])
    use_benchmark = st.sidebar.checkbox('benchmark models', value=defaults[4])
    use_default = st.sidebar.checkbox('default models', value=defaults[5])

    picked = {}
    if use_top_1y:
        picked['top_1y'] = project_config.TP1Y
    if use_benchmark:
        picked['benchmark'] = project_config.BENCHMARK_MODELS
    if use_special:
        # Other user groups (arbitrage, mm, restrade) are currently disabled.
        picked['iaai'] = project_config.IAAI_MODELS
    if use_default:
        picked['yx'] = project_config.MODEL_NAMES + project_config.NEW_MODEL_NAMES
        picked['mcv'] = project_config.MCV_MODELS + project_config.MCV_NEW_MODELS
    return picked
68
+
69
+
70
+ # to be removed
71
def model_fast_picker_bak(models):
    """Legacy CSV quick-picker shown in the sidebar.

    Args:
        models: collection of known model names used to validate the input.

    Returns:
        The names typed into the text area that are present in ``models``,
        de-duplicated, in the order they were typed.
    """
    label = '\nfast model picker by CSV string.\nexample: "model1, model2, model3"\n'
    typed = st.sidebar.text_area(label)
    if len(typed) == 0:
        return []
    stripped = (piece.strip() for piece in typed.split(','))
    known = [name for name in stripped if name in models]
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(known))
85
+
86
+
87
+
88
def default_model_picker():
    """Collect the model groups declared in the local JSON files.

    Reads ``default_models.json`` and then ``user_models.json`` (each only if
    the file exists in the working directory).

    Returns:
        dict of the merged top-level entries of both files; entries from the
        user file replace same-named entries from the default file.
    """
    picked_models = {}
    # Order matters: the user file is applied last so it wins on key clashes.
    for json_name in ('default_models.json', 'user_models.json'):
        if os.path.isfile(json_name):
            picked_models.update(project_utils.load_json(json_name))
    return picked_models
99
+
100
+
101
def model_fast_picker(model_list):
    """Let the user pick models by typing a comma-separated string.

    A text area is rendered in the sidebar; each comma-separated entry is
    stripped of surrounding whitespace and kept only if it appears in
    ``model_list``.

    Args:
        model_list: collection of valid model names.

    Returns:
        list of the recognised names, without duplicates, in typed order.
        Empty when nothing was typed.
    """
    prompt = '\nfast model picker by CSV string.\nexample: "model1, model2, model3"\n'
    csv_text = st.sidebar.text_area(prompt)
    recognised = {}
    if len(csv_text) > 0:
        for chunk in csv_text.split(','):
            candidate = chunk.strip()
            if candidate in model_list:
                # A dict keeps insertion order and ignores repeated names.
                recognised[candidate] = None
    return list(recognised)
115
+
116
+
117
+
118
+
119
+
120
+
121
def generate_round_table(data, row_cts, c, r, sortcol='corrmmc'):
    """Show one round's results, sorted by ``sortcol``, in grid cell ``row_cts[c]``.

    Args:
        data: DataFrame with at least 'roundNumber', 'roundResolveTime' and
            the ``sortcol`` column.
        row_cts: list of Streamlit column containers for the current grid row.
        c: column index of the cell inside ``row_cts``.
        r: row index of the cell; together with ``c`` it offsets the default
            round so each cell starts one round further back (2*r + c).
        sortcol: column used to sort the table, descending.
    """
    cell = row_cts[c]
    latest_round = int(data['roundNumber'].max())
    earliest_round = int(data['roundNumber'].min())
    suggest_round = int(latest_round - (2*r+c))
    select_round = cell.slider('select a round', earliest_round, latest_round, suggest_round, 1)

    round_data = (
        data[data['roundNumber'] == select_round]
        .sort_values(by=sortcol, ascending=False)
        .reset_index(drop=True)
    )
    # A round inside the slider range may have no rows at all; indexing [0]
    # on the empty column would raise, so report it instead.
    if round_data.empty:
        cell.info('no data was found for the selected round')
        return

    round_resolved_time = round_data['roundResolveTime'][0]
    cell.write(f'round: {select_round} resolved time: {round_resolved_time}')
    cell.dataframe(round_data.drop(['roundNumber', 'roundResolveTime'], axis=1), height=max_table_height-100)
135
+
136
+
137
+
138
+
139
+
140
+
141
def generate_dailyscore_metrics(data, row_cts, c, r):
    """Render a daily-score sharpe table for a selectable metric in one grid cell.

    Args:
        data: DataFrame with a 'roundNumber' column, passed on to
            ``project_utils.calculate_rounddailysharpe_dashboard``.
        row_cts: list of Streamlit column containers for the current grid row.
        c: column index of the cell inside ``row_cts``.
        r: row index of the cell; ``2*r + c`` picks the default metric.
    """
    cell = row_cts[c]
    metric_keys = list(id_metric_opt.keys())
    select_metric = cell.selectbox("", metric_keys, index=2*r+c, format_func=lambda x: id_metric_opt[x])

    round_numbers = data['roundNumber']
    latest_round = int(round_numbers.max())
    earliest_round = int(round_numbers.min())

    sharpe_table = project_utils.calculate_rounddailysharpe_dashboard(
        data, latest_round, earliest_round, id_metric_score_dic[select_metric]
    )
    cell.dataframe(sharpe_table.sort_values(by='sos', ascending=False), height=max_table_height-100)
150
+
151
def get_roundmetric_data(data):
    """Aggregate per-round scores into per-model summary statistics.

    The aggregation itself is delegated to
    ``project_utils.groupby_agg_execution``; this function supplies the recipe
    (sum/mean/count/sharpe for the raw scores, mean for the percentile
    columns) and strips the leading 'model_' from the resulting column names.

    Args:
        data: DataFrame holding a 'model' column and the score columns listed
            below.

    Returns:
        The aggregated frame for the 'model' grouping, with renamed columns.
    """
    score_cols = ['corr', 'mmc', 'tc', 'corrmmc', 'corrtc', 'fncV3', 'fncV3_pct']
    score_stats = ['sum', 'mean', 'count', {'sharpe': project_utils.get_array_sharpe}]
    pct_cols = ['corr_pct', 'mmc_pct', 'tc_pct', 'corrtc_avg_pct', 'corrmmc_avg_pct']
    pct_stats = ['mean']

    recipe = [
        [['model'], score_cols, score_stats],
        [['model'], pct_cols, pct_stats],
    ]
    res = project_utils.groupby_agg_execution(recipe, data)['model']

    prefix_len = len('model_')
    shortened = {
        name: name[prefix_len:]  # remove 'model_' in column name
        for name in res.columns.tolist()
        if name != 'model'
    }
    res.rename(columns=shortened, inplace=True)
    return res
170
+
171
+
172
def generate_round_metrics(data, row_cts, c, r):
    """Show the aggregated columns of one selectable metric in a grid cell.

    Args:
        data: per-model aggregate frame (a 'model' column plus
            '<metric>_<stat>' columns).
        row_cts: list of Streamlit column containers for the current grid row.
        c: column index of the cell inside ``row_cts``.
        r: row index of the cell; ``2*r + c`` picks the default metric.
    """
    cell = row_cts[c]
    select_metric = cell.selectbox("", list(roundmetric_opt.keys()), index=2*r+c, format_func=lambda x: roundmetric_opt[x])

    # Any column containing '<metric>_' belongs to the view. This also covers
    # the 'corrmmc_avg_' / 'corrtc_avg_' columns, which contain the marker.
    marker = f'{select_metric}_'
    cols = ['model'] + [col for col in data.columns.tolist() if marker in col]

    sort_col = 'corr_pct_mean' if select_metric == 'pct' else select_metric+'_sharpe'
    cell.dataframe(data[cols].sort_values(by=sort_col, ascending=False))
196
+
197
+
198
def dailyscore_chart(data, row_cts, c, r, select_metric):
    """Plot ``select_metric`` over 'date' for one selectable round in a grid cell.

    Args:
        data: DataFrame with 'roundNumber', 'date', 'model' and metric columns.
        row_cts: list of Streamlit column containers for the current grid row.
        c: column index of the cell inside ``row_cts``.
        r: row index of the cell; the default round is ``latest - (2*r + c)``.
        select_metric: name of the column drawn on the y axis.
    """
    cell = row_cts[c]
    round_numbers = data['roundNumber']
    latest_round = int(round_numbers.max())
    earliest_round = int(round_numbers.min())
    suggest_round = int(latest_round - (2*r+c))
    select_round = cell.slider('select a round', earliest_round, latest_round, suggest_round, 1)

    round_rows = data[round_numbers == select_round]
    if len(round_rows) == 0:
        cell.info('no data was found for the selected round')
        return

    fig = chart_pxline(round_rows, 'date', y=select_metric, color='model', hover_data=list(histtrend_opt.keys()))
    cell.plotly_chart(fig, use_container_width=True)
210
+
211
+
212
def generate_live_round_stake(data, row_cts, c, r):
    """Show stake and payout totals plus the per-model table for one live round.

    The round displayed in cell (r, c) is ``latest - (2*r + c)``. Nothing is
    rendered when that round has no rows.

    Args:
        data: DataFrame with 'roundNumber', 'payout' and 'stake' columns.
        row_cts: list of Streamlit column containers for the current grid row.
        c: column index of the cell inside ``row_cts``.
        r: row index of the cell.
    """
    select_round = int(int(data['roundNumber'].max()) - (2*r+c))
    select_data = data[data['roundNumber'] == select_round].reset_index(drop=True)
    if len(select_data) == 0:
        return

    cell = row_cts[c]
    payout_sum = select_data['payout'].sum().round(3)
    stake_sum = select_data['stake'].sum().round(3)
    # Non-negative payouts are shown in green, losses in red.
    payout_color = 'green' if payout_sum >= 0 else 'red'

    space = '&nbsp;'*5
    content_str = f'#### Round: {select_round}{space}Stake: {stake_sum}{space}Payout: <span style="color:{payout_color}">{payout_sum}</span> NMR'
    cell.markdown(content_str, unsafe_allow_html=True)

    table = select_data.drop(['roundNumber'], axis=1).sort_values(by='payout', ascending=False)
    cell.dataframe(table, height=max_table_height-100)
229
+
230
+
231
+
232
def round_view(data, select_perview, select_metric=None):
    """Lay out a 2x2 grid and fill each cell with the requested view.

    Args:
        data: DataFrame handed unchanged to the cell renderer.
        select_perview: one of 'round_result', 'dailyscore_metric',
            'metric_view', 'dailyscore_chart' or 'live_round_stake'; any
            other value leaves the cells empty.
        select_metric: metric name, only used by the 'dailyscore_chart' view.
    """
    grid_rows, grid_cols = 2, 2
    for r in range(grid_rows):
        # One fresh set of column containers per grid row.
        row_cts = st.columns(grid_cols)
        for c in range(grid_cols):
            if select_perview == 'round_result':
                generate_round_table(data, row_cts, c, r)
            elif select_perview == 'dailyscore_metric':
                generate_dailyscore_metrics(data, row_cts, c, r)
            elif select_perview == 'metric_view':
                generate_round_metrics(data, row_cts, c, r)
            elif select_perview == 'dailyscore_chart':
                dailyscore_chart(data, row_cts, c, r, select_metric)
            elif select_perview == 'live_round_stake':
                generate_live_round_stake(data, row_cts, c, r)
248
+
249
+
250
def score_overview():
    """Render the per-round result tables from the model data in session state.

    Shows a hint instead when no 'model_data' entry is stored in
    ``st.session_state``.
    """
    if 'model_data' not in st.session_state:
        st.write('model data missing, please go to the Dowanload Score Data section to download model data first')
        return

    # Work on a copy and keep the first row per (model, round) pair.
    data = st.session_state['model_data'].copy()
    data = data.drop_duplicates(['model', 'roundNumber'], keep='first')
    with st.expander('round performance overview', expanded=True):
        round_view(data, 'round_result')
259
+
260
def metric_overview():
    """Render per-model aggregate metrics for a user-selected round range.

    Reads 'model_data' from ``st.session_state``, lets the user choose a
    round range and (in the sidebar) a minimum round count per model, then
    shows the metric grid and the full aggregate table. Shows a hint instead
    when no model data is stored.
    """
    if 'model_data' not in st.session_state:
        st.write('model data missing, please go to the Dowanload Score Data section to download model data first')
        return

    data = st.session_state['model_data'].copy()
    st.subheader('Select Round Data')
    latest_round = int(data['roundNumber'].max())
    earliest_round = int(data['roundNumber'].min())
    if (latest_round - earliest_round) > 10:
        # Preferred default start is round 280, kept inside the rounds that
        # actually exist in the data.
        suggest_round = min(max(280, earliest_round), latest_round)
    else:
        suggest_round = earliest_round
    # Default end is the round before the latest, but never before the start.
    suggest_end = max(suggest_round, latest_round - 1)
    select_rounds = st.slider('select a round', earliest_round, latest_round, (suggest_round, suggest_end), 1)

    data = data.drop_duplicates(['model', 'roundNumber'], keep='first')
    in_range = (data['roundNumber'] >= select_rounds[0]) & (data['roundNumber'] <= select_rounds[1])
    data = data[in_range].reset_index(drop=True)
    if data.empty:
        # Without rows the count min/max below would be NaN and int() fails.
        st.info('no data was found for the selected rounds')
        return

    roundmetrics_data = get_roundmetric_data(data)
    min_count = int(roundmetrics_data['count'].min())
    max_count = int(roundmetrics_data['count'].max())
    if min_count < max_count:
        select_minround = st.sidebar.slider('miminum number of rounds', min_count, max_count, min_count, 1)
    else:
        # All models have the same number of rounds: nothing to filter on.
        select_minround = min_count
    roundmetrics_data = roundmetrics_data[roundmetrics_data['count'] >= select_minround].reset_index(drop=True)

    metricview_exp = st.expander('metric overview', expanded=True)
    dataview_exp = st.expander('full data view', expanded=False)
    with metricview_exp:
        round_view(roundmetrics_data, 'metric_view')
    with dataview_exp:
        st.write(roundmetrics_data)
290
+
291
+
292
def data_operation():
    """Download (or reload) model round results and store them in session state.

    Builds the model selection from the default/user JSON groups and the CSV
    quick-picker, lets the user choose a tournament round range, downloads
    the results on request, derives the combined score columns and stores the
    frame under ``st.session_state['model_data']``.

    Returns:
        None
    """
    full_model_list = st.session_state['models']
    latest_round = project_utils.latest_round

    models = []
    # Defined up front so the 'show background data' section can always
    # display it, even when the default models are switched off.
    model_dict = {}
    benchmark_opt = st.sidebar.checkbox('download default models', value=True)
    if benchmark_opt:
        model_dict = default_model_picker()
        for group_models in model_dict.values():
            models += group_models
    models = models + model_fast_picker(full_model_list)

    # Defined up front so the download check below never hits an unbound name
    # when no model was preselected.
    model_selection = []
    if len(models)>0:
        # Keep only defaults the widget offers as options and drop repeats
        # (a model can come from both the JSON groups and the quick-picker).
        preselected = [m for m in dict.fromkeys(models) if m in full_model_list]
        model_selection = st.multiselect('select models', full_model_list, default=preselected)

    suggest_min_round = 182 #latest_round-50
    min_round, max_round = st.slider('select tournament rounds', 200, latest_round, (suggest_min_round, latest_round), 1)
    roundlist = list(range(max_round, min_round-1, -1))
    download = st.button('download data of selected models')
    st.sidebar.subheader('configuration')
    show_info = st.sidebar.checkbox('show background data', value=False)

    # Start from the locally saved copy; a requested download replaces it.
    model_df = get_saved_data()
    if download and len(model_selection)>0:
        with st.spinner('downloading model round results'):
            model_df = download_model_round_result(model_selection, roundlist, show_info)

    prjreload = st.sidebar.button('reload config')
    if prjreload:
        project_utils.reload_project()

    if len(model_df)>0:
        rename_dict = {'corrPercentile': 'corr_pct', 'correlation':'corr', 'corrWMetamodel':'corr_meta', 'mmcPercentile':'mmc_pct', 'tcPercentile':'tc_pct', 'fncV3Percentile':'fncV3_pct'}
        model_df.rename(columns=rename_dict, inplace=True)
        model_df['corrmmc'] = model_df['corr'] + model_df['mmc']
        model_df['corrmmc_avg_pct'] = (model_df['corr_pct'] + model_df['mmc_pct'])/2
        model_df['corrtc'] = model_df['corr'] + model_df['tc']
        model_df['corrtc_avg_pct'] = (model_df['corr_pct'] + model_df['tc_pct'])/2
        ord_cols = ['model','corr', 'tc', 'corrtc', 'corr_pct', 'tc_pct', 'corrtc_avg_pct','corr_meta', 'fncV3', 'fncV3_pct','corrmmc_avg_pct', 'roundNumber', 'roundResolveTime', 'mmc', 'corrmmc','mmc_pct']

        model_df = model_df[ord_cols]
        if project_config.SAVE_LOCAL_COPY:
            try:
                project_utils.pickle_data(project_config.MODEL_ROUND_RESULT_FILE, model_df)
            except Exception:
                # Saving is best effort: the data stays usable from session
                # state, so only report the failure when details are wanted.
                if show_info:
                    st.warning(f'could not save a local copy of the model data: {traceback.format_exc()}')
        st.session_state['model_data'] = model_df

    if show_info:
        st.text('list of models being tracked')
        st.write(model_dict)
        try:
            dshape = st.session_state['model_data'].shape
            st.write(f'downloaded model result data shape is {dshape}')
            st.write(model_df)
        except Exception:
            # No stored frame (missing key or not a DataFrame).
            st.write('model data was not retrieved')

    if len(model_df)>0:
        get_performance_data_status(model_df)
    return None
357
+
358
def get_saved_data():
    """Load the locally saved model round results, if the file exists.

    When the file configured as ``project_config.MODEL_ROUND_RESULT_FILE`` is
    present, its content is also stored under
    ``st.session_state['model_data']``.

    Returns:
        The loaded data, or an empty list when there is no saved file.
    """
    saved_path = project_config.MODEL_ROUND_RESULT_FILE
    if not os.path.isfile(saved_path):
        return []
    saved = project_utils.load_data(saved_path)
    st.session_state['model_data'] = saved
    return saved
364
+
365
def get_performance_data_status(df):
    """Print a short summary of the model data in the sidebar.

    Args:
        df: DataFrame with 'model' and 'roundNumber' columns.

    Returns:
        None
    """
    st.sidebar.subheader('model data summary')
    rounds = df['roundNumber']
    summary = (
        ('number of models', df['model'].nunique()),
        ('number of rounds', rounds.nunique()),
        ('latest round', rounds.max()),
    )
    for caption, figure in summary:
        st.sidebar.text(f'{caption}: {figure}')
    return None
376
+
377
+
378
def download_model_round_result(models, roundlist, show_info):
    """Download round results for each model and combine them into one frame.

    Args:
        models: sequence of model names to query through
            ``numerapi_utils.daily_submissions_performances_V3``.
        roundlist: round numbers to keep in the combined result.
        show_info: when true, per-model problems (no result, or the traceback
            of a failed request) are shown with ``st.info``.

    Returns:
        DataFrame sorted by 'roundNumber' descending, restricted to
        ``roundlist``, with 'roundResolveTime' formatted using
        ``project_config.DATETIME_FORMAT3``. An empty DataFrame when no model
        returned any result.
    """
    model_dfs = []
    my_bar = st.progress(0.0)
    total = len(models)
    for done, model_name in enumerate(models, start=1):
        message = ''
        try:
            model_res = numerapi_utils.daily_submissions_performances_V3(model_name)
            if len(model_res) > 0:
                # Put the model name first, followed by the API's own fields.
                cols = ['model'] + list(model_res[0].keys())
                single_df = pd.DataFrame(model_res)
                single_df['model'] = model_name
                model_dfs.append(single_df[cols])
            else:
                message = f'no result found for model {model_name}'
        except Exception:
            # One failing model must not abort the whole download.
            except_msg = traceback.format_exc()
            message = f'error while getting result for {model_name}: {except_msg}'
        if show_info and len(message) > 0:
            st.info(message)
        time.sleep(0.1)
        my_bar.progress(done / total)

    # pd.concat raises on an empty list; callers only check len(result) > 0.
    if not model_dfs:
        return pd.DataFrame()

    model_df = pd.concat(model_dfs, axis=0).sort_values(by=['roundNumber'], ascending=False).reset_index(drop=True)
    model_df['roundResolveTime'] = pd.to_datetime(model_df['roundResolveTime'])
    model_df['roundResolveTime'] = model_df['roundResolveTime'].dt.strftime(project_config.DATETIME_FORMAT3)
    model_df = model_df[model_df['roundNumber'].isin(roundlist)].reset_index(drop=True)
    return model_df
413
+
414
def chart_pxline(data, x, y, color, hover_data=None, x_range=None):
    """Build a dark-themed Plotly Express line chart.

    Args:
        data: DataFrame to plot.
        x: column name for the x axis.
        y: column name for the y axis.
        color: column name that splits the data into separate lines.
        hover_data: optional extra columns shown on hover.
        x_range: optional ``[min, max]`` range for the x axis.

    Returns:
        The configured Plotly figure.
    """
    dark_layout = dict(
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white',
        height=max_height,
        margin=dict(l=0, r=10, t=20, b=20),
    )
    fig = px.line(data, x=x, y=y, color=color, hover_data=hover_data)
    fig.update_layout(**dark_layout)
    fig.update_xaxes(showgrid=False, range=x_range)
    fig.update_yaxes(gridcolor='grey')
    return fig
420
+
421
+
422
def roundresult_chart(data, model_selection):
    """Plot a selectable metric per round for the chosen models.

    The legend entry of each model carries its mean of the metric over the
    selected round range.

    Args:
        data: DataFrame with 'model', 'roundNumber', 'roundResolveTime' and
            the metric columns.
        model_selection: model names to include in the chart.
    """
    chosen = data[data['model'].isin(model_selection)]
    round_data = chosen.drop_duplicates(['model', 'roundNumber'], keep='first').reset_index(drop=True)

    min_round = int(round_data['roundNumber'].min())
    max_round = int(round_data['roundNumber'].max())
    suggest_min_round = max_round - 20
    if min_round == max_round:
        # Only one round of data: widen the slider to a 20-round window.
        min_round = suggest_min_round

    min_selectround, max_selectround = st.slider(
        'select plotting round range', min_round, max_round, (suggest_min_round, max_round), 1
    )
    select_metric = st.selectbox(
        'Choose a metric', list(histtrend_opt.keys()), index=0, format_func=lambda x: histtrend_opt[x]
    )

    wanted_rounds = list(range(min_selectround, max_selectround + 1))
    round_data = round_data[round_data['roundNumber'].isin(wanted_rounds)]

    # Per-model mean of the metric, used both as legend text and sort key.
    mean_df = round_data.groupby(['model'])[select_metric].agg('mean').reset_index()
    mean_df['model avg.'] = mean_df['model'] + ': ' + mean_df[select_metric].round(5).astype(str)
    mean_df['mean'] = mean_df[select_metric]

    round_data = round_data.merge(right=mean_df[['model', 'model avg.', 'mean']], on='model', how='left')
    round_data = round_data.sort_values(by=['mean', 'model', 'roundNumber'], ascending=False)

    fig = chart_pxline(
        round_data,
        'roundNumber',
        y=select_metric,
        color='model avg.',
        hover_data=list(histtrend_opt.keys())+['roundResolveTime'],
        x_range=[min_selectround, max_selectround],
    )
    if fig is not None:
        st.plotly_chart(fig, use_container_width=True)
447
+
448
+
449
+
450
+
451
+
452
+
453
def histtrend():
    """Render the historic trend chart for models picked in the sidebar.

    Models come from the 'model_data' frame in ``st.session_state``. The CSV
    quick-picker supplies the preselection; when it yields nothing, the first
    available model is preselected. An info message is shown when the model
    data is missing or contains no models, or when the user deselects
    everything.
    """
    missing_msg = 'model result data file missing, or no model is selected'
    # Same guard as the overview pages: without downloaded data there is
    # nothing to chart, and indexing session state would raise.
    if 'model_data' not in st.session_state:
        st.info(missing_msg)
        return

    data = st.session_state['model_data'].copy()
    models = data['model'].unique().tolist()
    default_models = model_fast_picker(models)
    if len(models) == 0:
        st.info(missing_msg)
        return

    if len(default_models)==0:
        default_models = [models[0]]
    model_selection = st.sidebar.multiselect('select models for chart', models, default=default_models)

    if len(model_selection)>0:
        roundresult_chart(data, model_selection)
    else:
        st.info('please select some models from the dropdown list')
475
+
476
+ # st.write(models)
477
+
478
+
479
+
480
def model_evaluation():
    """Compare selected models on a radar chart of aggregated round metrics.

    Reads 'model_data' from ``st.session_state``, lets the user pick models
    (sidebar), a round range and a set of metrics, aggregates the rounds with
    ``get_roundmetric_data`` and draws one radar trace per model. Each metric
    is rescaled from its expected value range onto the common radar range via
    ``project_utils.rescale``. The calculated and the rescaled values are
    also shown as tables.
    """
    # Same guard as the overview pages: indexing session state would raise.
    if 'model_data' not in st.session_state:
        st.info('model data missing, please download model data first')
        return

    data = st.session_state['model_data'].copy()
    models = data['model'].unique().tolist()
    model_selection = []
    default_models = model_fast_picker(models)

    radar_scale = [0, 5]
    # Source range per metric kind. Order matters: when a column name contains
    # several keywords (e.g. 'corrtc_avg_pct_mean') the LAST match wins.
    scale_by_keyword = (
        ('mean', [-0.05, 0.1]),
        ('sharpe', [-0.2, 2]),
        ('pct', [0, 1]),
        ('count', [1, 50]),
    )

    if len(models)>0:
        if len(default_models)==0:
            default_models = [models[0]]
        model_selection = st.sidebar.multiselect('select models for chart', models, default=default_models)

    if len(model_selection) == 0:
        return

    round_data = data[data['model'].isin(model_selection)].drop_duplicates(['model', 'roundNumber'],keep='first').reset_index(drop=True)
    min_round = int(round_data['roundNumber'].min())
    max_round = int(round_data['roundNumber'].max())
    suggest_min_round = max_round - 20
    if min_round == max_round:
        min_round = max_round - 20

    min_selectround, max_selectround = st.slider('select plotting round range', min_round, max_round,
                                                 (suggest_min_round, max_round), 1)
    round_list = list(range(min_selectround, max_selectround+1))

    defaultlist = ['corr_sharpe', 'tc_sharpe', 'corrtc_sharpe', 'corr_mean', 'tc_mean', 'corrtc_mean', 'corrtc_avg_pct_mean']
    select_metrics = st.multiselect('Metric Selection', list(model_eval_opt.keys()),
                                    format_func=lambda x: model_eval_opt[x], default=defaultlist)

    round_data = round_data[round_data['roundNumber'].isin(round_list)].reset_index(drop=True)
    roundmetric_df = get_roundmetric_data(round_data).sort_values(by='corrtc_sharpe', ascending=False).reset_index(drop=True)

    radarmetric_df = roundmetric_df.copy(deep=True)
    for col in select_metrics:
        use_scale = None
        for keyword, source_scale in scale_by_keyword:
            if keyword in col:
                use_scale = source_scale
        if use_scale is None:
            # Unknown metric kind: leave it unscaled rather than reusing the
            # previous column's range.
            continue
        radarmetric_df[col] = radarmetric_df[col].apply(
            lambda x, src=use_scale: project_utils.rescale(x, src, radar_scale)
        )

    select_metrics_name = [model_eval_opt[i] for i in select_metrics]
    radarmetric_df.rename(columns=model_eval_opt, inplace=True)
    roundmetric_df.rename(columns=model_eval_opt, inplace=True)

    fig = go.Figure()
    for i in range(len(radarmetric_df)):
        fig.add_trace(go.Scatterpolar(
            r=radarmetric_df.loc[i, select_metrics_name].values,
            theta=select_metrics_name,
            fill='toself',
            name=radarmetric_df['model'].values[i]
        ))

    # The radial axis spans exactly the range the metrics were rescaled to.
    fig.update_polars(
        radialaxis=dict(visible=True, autorange=False, range=radar_scale)
    )
    fig.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='aliceblue',
                      height=max_height+100,
                      margin=dict(l=0, r=10, t=20, b=20), showlegend=True)

    st.plotly_chart(fig, use_container_width=True)
    st.text('Calculated Metrics')
    st.dataframe(roundmetric_df[['model'] + select_metrics_name], height=max_table_height)
    st.text('Rescaled Metrics on Chart')
    st.dataframe(radarmetric_df[['model'] + select_metrics_name], height=max_table_height)
557
+
558
+ # st.write(select_metrics)
559
+
560
+
561
def get_portfolio_overview(models, onlylatest=True):
    """Collect stake and profit history for the given models.

    Args:
        models: sequence of model names queried through
            ``numerapi_utils.get_model_history_v3``.
        onlylatest: when true, keep only the row labelled 0 of each model's
            history and sort the result by 'floating_pl' descending.

    Returns:
        DataFrame with the columns listed in ``cols`` below, including the
        derived 'profitability' (realised P/L relative to the stake before
        that P/L). An empty list when nothing could be assembled, e.g. no
        model returned a history.
    """
    res_df = []
    my_bar = st.progress(0.0)
    total = len(models)
    for done, m in enumerate(models, start=1):
        try:
            mdf = numerapi_utils.get_model_history_v3(m)
            if onlylatest:
                mdf = mdf.loc[0:0]
            res_df.append(mdf)
        except Exception:
            # Models without retrievable history are skipped. Catching
            # Exception (not a bare except) lets BaseException-based control
            # flow such as interrupts propagate.
            pass
        time.sleep(0.1)
        my_bar.progress(done / total)

    try:
        res_df = pd.concat(res_df, axis=0)
        res_df['profitability'] = res_df['realised_pl']/(res_df['current_stake']-res_df['realised_pl'])
        cols = ['model', 'date', 'current_stake', 'floating_stake', 'floating_pl', 'realised_pl', 'profitability', 'roundNumber', 'roundResolved', 'payout']
        if onlylatest:
            res_df = res_df.sort_values(by='floating_pl', ascending=False).reset_index(drop=True)
        return res_df[cols]
    except Exception:
        # Nothing to concatenate or expected columns missing.
        return []
597
+
598
+
599
def get_stake_type(corr, mmc):
    """Describe a stake as '<corr>xCORR <mmc>xMMC', or '1xCORR' without MMC.

    Args:
        corr: numeric CORR multiplier.
        mmc: numeric MMC multiplier; values <= 0 mean no MMC stake.

    Returns:
        The label string. Whole multipliers print without a decimal point
        (1.0 -> '1'); fractional ones keep their fraction (0.5 -> '0.5')
        instead of being truncated to an integer.
    """
    if mmc > 0:
        return f'{float(corr):g}xCORR {float(mmc):g}xMMC'
    return '1xCORR'
605
+
606
+
607
@st.cache(suppress_st_warning=True)
def get_stake_by_liverounds(models):
    """Fetch stake and payout data of the five most recent rounds per model.

    Args:
        models: sequence of model names queried through
            ``numerapi_utils.get_round_model_performance``.

    Returns:
        DataFrame with columns 'model', 'roundNumber', 'stake', 'payout',
        'stake_type', 'corr' and 'mmc'. An empty frame with these columns
        when no (round, model) pair returned data.
    """
    col_ord = ['model', 'roundNumber', 'stake', 'payout', 'stake_type', 'corr', 'mmc']
    latest_round_id = int(project_utils.get_latest_round_id())
    roundlist = list(range(latest_round_id, latest_round_id - 5, -1))

    res = []
    my_bar = st.progress(0.0)
    total = len(models) * len(roundlist)
    done = 0
    for r in roundlist:
        for m in models:
            try:
                res.append(numerapi_utils.get_round_model_performance(r, m))
            except Exception:
                # A model may simply have no entry for this round; skip it.
                pass
            done += 1
            time.sleep(0.1)
            my_bar.progress(done / total)

    # Without any record the column lookups below would raise KeyError.
    if not res:
        return pd.DataFrame(columns=col_ord)

    res_df = pd.DataFrame.from_dict(res).fillna(0)
    res_df['payoutPending'] = res_df['payoutPending'].astype(np.float64)
    res_df['selectedStakeValue'] = res_df['selectedStakeValue'].astype(np.float64)
    res_df['stake_type'] = res_df.apply(lambda x: get_stake_type(x['corrMultiplier'], x['mmcMultiplier']),axis=1)
    rename_dict = {'selectedStakeValue': 'stake', 'payoutPending': 'payout', 'correlation':'corr'}
    res_df = res_df.rename(columns=rename_dict)
    return res_df[col_ord]
639
+
640
+
641
+
642
def get_stake_graph(data):
    """Plot total stake, floating stake and realised P/L summed per date.

    A select box limits the plotted time span to roughly the last month
    (5 weeks), three months (13 weeks) or year (52 weeks), counted back from
    the latest date in the data; any other option plots everything.

    Args:
        data: DataFrame with 'date', 'current_stake', 'floating_stake',
            'floating_pl' and 'realised_pl' columns.
    """
    agg_rcp = [[['date'], ['current_stake', 'floating_stake', 'floating_pl', 'realised_pl'], ['sum']]]

    select_opt = st.selectbox('Select Time Span', list(stakeoverview_plot_opt.keys()), index=1, format_func=lambda x: stakeoverview_plot_opt[x])

    res = project_utils.groupby_agg_execution(agg_rcp, data)['date']
    last_date = res['date'].max()
    # Earliest date (exclusive) kept for each time-span option.
    cutoffs = {
        '1month': last_date - datetime.timedelta(weeks=5),
        '3month': last_date - datetime.timedelta(weeks=13),
        '1year': last_date - datetime.timedelta(weeks=52),
    }

    res = res.rename(columns={
        'date_current_stake_sum': 'total_stake',
        'date_floating_stake_sum': 'floating_stake',
        'date_floating_pl_sum': 'floating_pl',
        'date_realised_pl_sum': 'realised_pl',
    })
    if select_opt in cutoffs:
        res = res[res['date'] > cutoffs[select_opt]]

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # (column, drawn on secondary axis?) in drawing order.
    for series_name, on_secondary in (
        ('floating_stake', False),
        ('total_stake', False),
        ('realised_pl', True),
    ):
        fig.add_trace(
            go.Scatter(x=res['date'], y=res[series_name], name=series_name),
            secondary_y=on_secondary,
        )
    fig.update_layout(plot_bgcolor='black', paper_bgcolor='black', font_color='white')
    fig.update_xaxes(showgrid=False, range=None, nticks=30)
    fig.update_yaxes(gridcolor='grey', title_text="total stake/floating stake/realised PL", secondary_y=False)
    fig.update_yaxes(showgrid=False, title_text="realised PL", zeroline=False,secondary_y=True)
    st.plotly_chart(fig, use_container_width=True)
680
+
681
+ #
682
+ # def live_round_stakeview(data):
683
+ # models = data
684
+ # latest_round_id = int(project_utils.get_latest_round_id())
685
+ # roundlist = [i for i in range(latest_round_id, latest_round_id-4, -1]
686
+
687
+
688
def check_session_state(key):
    """Return the session-state value stored under ``key``, or None if absent."""
    return st.session_state[key] if key in st.session_state else None
694
+
695
+
696
def stake_overview():
    """
    Render the stake overview page: model picker in the sidebar, stake / P&L chart,
    summary table with totals, and two optional per-round tables.
    :return: None; everything is written to the Streamlit page
    """
    models = st.session_state['models'].copy()
    model_selection = []

    # Flatten the default model groups. The flattened list is not consumed
    # further down in this function.
    baseline_models = []
    model_dict = default_model_picker()
    for group_name in model_dict.keys():
        baseline_models += model_dict[group_name]

    default_models = model_fast_picker(models)

    if len(models) > 0:
        model_selection = st.sidebar.multiselect('select models for chart', models, default=default_models)

    # Nothing selected: nothing to draw.
    if len(model_selection) == 0:
        return

    # Reuse the cached stake data only when it was built for exactly this selection.
    cache_is_valid = ('stake_df' in st.session_state
                      and set(model_selection) == st.session_state['stake_overview_models'])
    if cache_is_valid:
        ovdf = st.session_state['stake_df']
    else:
        ovdf = get_portfolio_overview(model_selection, onlylatest=False)
        st.session_state['stake_df'] = ovdf
        st.session_state['stake_overview_models'] = set(ovdf['model'].unique().tolist())

    # The full history feeds the chart; the first row of each model feeds the table.
    chartdf = ovdf.copy(deep=True)
    ovdf = ovdf.drop_duplicates('model', keep='first')
    ovdf = ovdf.sort_values(by='floating_pl', ascending=False).reset_index(drop=True)
    if len(ovdf) == 0:
        return

    overview_cols = ['model', 'current_stake', 'floating_stake', 'floating_pl', 'realised_pl']
    date_text = datetime.datetime.now().strftime(project_config.DATETIME_FORMAT3)
    ovdf.drop(['date'], axis=1, inplace=True)

    # Placeholders are created first so the totals appear above the chart,
    # even though they are filled in afterwards.
    stake_cts = st.columns(2)
    pl_cts = st.columns(2)
    date_label = st.empty()
    get_stake_graph(chartdf)
    ovdf_exp = st.expander('stake data overview', expanded=True)
    with ovdf_exp:
        st.dataframe(ovdf[overview_cols], height=max_table_height)

    total_current_stake = round(ovdf['current_stake'].sum(), 3)
    total_floating_stake = round(ovdf['floating_stake'].sum(), 3)
    rpl = round(ovdf['realised_pl'].sum(), 3)
    fpl = round(ovdf['floating_pl'].sum(), 3)
    current_stake_str = f'### Stake Balance: {total_current_stake:0.3f} NMR'
    float_stake_str = f'### Floating Balance: {total_floating_stake:0.3f} NMR'
    real_pl_color = 'green' if rpl >= 0 else 'red'
    float_pl_color = 'green' if fpl >= 0 else 'red'
    real_pl_str = f'### Realised P/L: <span style="color:{real_pl_color}">{rpl}</span> NMR'
    float_pl_str = f'### Floating P/L: <span style="color:{float_pl_color}">{fpl}</span> NMR'
    stake_cts[0].markdown(current_stake_str, unsafe_allow_html=True)
    stake_cts[1].markdown(float_stake_str, unsafe_allow_html=True)
    pl_cts[0].markdown(real_pl_str, unsafe_allow_html=True)
    pl_cts[1].markdown(float_pl_str, unsafe_allow_html=True)
    date_label.subheader(f'Date: {date_text}')

    if st.sidebar.checkbox('show breakdown by live rounds', value=False):
        liveround_exp = st.expander('show breakdown by live rounds (requires extra data downloading)',expanded=True)
        with liveround_exp:
            stake_models = ovdf['model'].tolist()
            liveround_stake_df = get_stake_by_liverounds(stake_models)
            round_view(liveround_stake_df,'live_round_stake')

    if st.sidebar.checkbox('show resolved round summary', value=False):
        resolvedround_exp = st.expander('show resolved rounds summary for selected model group', expanded=True)
        with resolvedround_exp:
            get_roundresolve_history(chartdf)
773
+
774
+
775
def get_roundresolve_history(data):
    """
    Write a per-round summary (total stake, round P/L, resolved date) of the
    resolved rounds to the Streamlit page.
    :param data: DataFrame with the columns 'roundResolved', 'roundNumber', 'date',
                 'current_stake' and 'payout'
    :return: None
    """
    # Evaluate the resolved-round mask once; the original built it twice and also
    # computed a list of resolved round numbers that was never used.
    resolved_mask = data['roundResolved'] == True
    rsdf = data[resolved_mask].reset_index(drop=True)

    # One date per round, taken from the first row seen for that round.
    rs_date = rsdf[['date', 'roundNumber']].drop_duplicates('roundNumber').reset_index(drop=True)

    numfeats = ['current_stake', 'payout']
    stat1 = ['sum']
    agg_rcp = [[['roundNumber'], numfeats, stat1]]
    res = project_utils.groupby_agg_execution(agg_rcp, rsdf)['roundNumber'].sort_values(by='roundNumber',
                                                                                        ascending=False)
    res = res.merge(right=rs_date, on='roundNumber')

    rename_dict = {'roundNumber': 'Round', 'roundNumber_current_stake_sum': 'Total Stake',
                   'roundNumber_payout_sum': 'Round P/L', 'date': 'Resolved Date'}
    res.rename(columns=rename_dict, inplace=True)
    st.write(res)
790
+
791
+
792
+
793
+
794
def app_setting():
    """
    Render the application settings page and store the chosen options in the
    Streamlit session state once the confirm button is pressed.
    :return: None
    """
    pfm_exp = st.expander('Performance Data Setting', expanded=True)
    with pfm_exp:
        pfm_default_model = st.checkbox('download data for default model', value=True)

    # Created for its on-page effect only; no widgets are placed inside it here.
    stake_exp = st.expander('stake overview data setting', expanded=True)
    if st.button('confirm setting'):
        st.session_state['pfm_default_model'] = pfm_default_model
802
+
803
+
804
+
805
def performance_overview():
    """
    Show the performance sub-page picked in the sidebar select box.
    The choices are the keys of ``pfm_opt``; the first one is preselected.
    :return: None
    """
    select_app = st.sidebar.selectbox("", list(pfm_opt.keys()), index=0, format_func=lambda x: pfm_opt[x])
    # The select box yields exactly one key, so at most one branch runs.
    if select_app == 'data_op':
        data_operation()
    elif select_app == 'liveround_view':
        score_overview()
    elif select_app == 'metric_view':
        metric_overview()
    elif select_app == 'historic_trend':
        histtrend()
    elif select_app == 'model_evaluation':
        model_evaluation()
818
+
819
+
820
+
821
def show_content():
    """
    Top-level page router: pick a dashboard in the sidebar and render it.
    The choices are the keys of ``app_opt``; the second one is preselected.
    :return: None
    """
    st.sidebar.header('Dashboard Selection')
    select_app = st.sidebar.selectbox("", list(app_opt.keys()), index=1, format_func=lambda x: app_opt[x])
    # The select box yields exactly one key, so at most one branch runs.
    if select_app == 'performance_overview':
        performance_overview()
    elif select_app == 'stake_overview':
        stake_overview()
    elif select_app == 'app_setting':
        app_setting()
830
+
831
+
832
+ # main body
833
+ # various configuration setting
834
# Option tables: each maps an internal key to the label shown in the UI.

# Top-level dashboards offered in the sidebar.
app_opt = {
    'performance_overview': 'Performance Overview',
    'stake_overview': 'Stake Overview',
    # 'app_setting':''
}

# Sub-pages of the performance dashboard.
pfm_opt = {
    'data_op': 'Download Score Data',
    'liveround_view': 'Round Overview',
    'metric_view': 'Metric Overview',
    'historic_trend': 'Historic Trend',
    'model_evaluation': 'Model Evaluation',
}

tbl_opt = {
    'round_result': 'Round Results',
    'dailyscore_metric': 'Daily Score Metrics',
    'round_metric': 'Round Metrics',
}

id_metric_opt = {
    'id_corr_sharpe': 'Daily Score corr sharpe',
    'id_mmc_sharpe': 'Daily Score mmc sharpe',
    'id_corrmmc_sharpe': 'Daily Score corrmmc sharpe',
    'id_corr2mmc_sharpe': 'Daily Score corr2mmc sharpe',
    'id_corrmmcpct_sharpe': 'Daily Score corrmmc avg pct sharpe',
    'id_corr2mmcpct_sharpe': 'Daily Score corr2mmc avg pct sharpe',
    'id_corrpct_sharpe': 'Daily Score corr pct sharpe',
    'id_mmcpct_sharpe': 'Daily Score mmc pct sharpe',
}

# Same keys as id_metric_opt, mapped to a score name.
id_metric_score_dic = {
    'id_corr_sharpe': 'corr',
    'id_mmc_sharpe': 'mmc',
    'id_corrmmc_sharpe': 'corrmmc',
    'id_corr2mmc_sharpe': 'corr2mmc',
    'id_corrmmcpct_sharpe': 'cmavg_pct',
    'id_corr2mmcpct_sharpe': 'c2mavg_pct',
    'id_corrpct_sharpe': 'corr_pct',
    'id_mmcpct_sharpe': 'mmc_pct',
}

roundmetric_opt = {
    'corr': 'Corr metrics',
    'tc': 'TC metrics',
    'corrtc': 'CorrTC metrics',
    'fncV3': 'FNCV3 metrics',
    'pct': 'Percentage metrics',  # label typo fixed (was 'Pecentage')
    'corrmmc': 'CorrMMC metrics',
    'mmc': 'MMC metrics',
}

histtrend_opt = {
    'corr': 'Correlation',
    'mmc': 'MMC',
    'tc': 'TC',
    'corr_pct': 'Correlation Percentile',
    'tc_pct': 'TC Percentile',
    'mmc_pct': 'MMC Percentile',
    'corrmmc': 'Correlation+MMC',
    'corrtc': 'Correlation+TC',
    'corrtc_avg_pct': 'Correlation+TC Average Percentile',
    'corrmmc_avg_pct': 'Correlation+MMC Average Percentile',
}

model_eval_opt = {
    'corr_sharpe': 'Correlation Sharpe',
    'mmc_sharpe': 'MMC Sharpe',
    'tc_sharpe': 'TC Sharpe',
    'corrtc_sharpe': 'Correlation+TC Sharpe',
    'corrmmc_sharpe': 'Correlation+MMC Sharpe',
    'corr_mean': 'Avg. Correlation',
    'tc_mean': 'Avg. TC',
    'count': 'Number of Rounds',
    'mmc_mean': 'Avg. MMC',
    'corrtc_mean': 'Avg. Correlation+TC',
    'corrmmc_mean': 'Avg. Correlation+MMC',
    'corr_pct_mean': 'Avg. Correlation Percentile',
    'mmc_pct_mean': 'Avg. MMC Percentile',
    'corrmmc_avg_pct_mean': 'Avg. Correlation+MMC Percentile',
    'corrtc_avg_pct_mean': 'Avg. Correlation+TC Percentile',
}

# Time windows selectable for the stake overview chart.
stakeoverview_plot_opt = {
    '1month': '1 Month',
    '3month': '3 Months',
    '1year': '1 Year',
    'all': 'Display all available data',
}
930
+
931
def show_session_status_info():
    """
    Debug helper: report on the page whether the 'model_data' and 'models'
    session-state entries exist, and how large they are.
    :return: None
    """
    key1 = 'model_data'
    key2 = 'models'

    if check_session_state(key1) is not None:
        st.write(f'{key1} shape is {st.session_state[key1].shape}')
    else:
        st.write(f'{key1} is None')

    if check_session_state(key2) is not None:
        st.write(f'{key2} list has {len(st.session_state[key2])} models')
    else:
        st.write(f'{key2} is None')
945
+
946
+
947
+
948
# Reload the project helper modules on every script run.
project_utils.reload_project()

# Sidebar controls for plot and table heights; both values are read as
# module-level names by the page functions.
height_exp = st.sidebar.expander('Plots and tables setting', expanded=False)
with height_exp:
    max_height = st.slider('Please choose the height for plots',
                           min_value=100, max_value=1000, value=400, step=50)
    max_table_height = st.slider('Please choose the height for tables',
                                 min_value=100, max_value=1000, value=500, step=50)

st.title('Numerai Dashboard')

df = get_saved_data()

# Fetch the leaderboard model list once per session.
if check_session_state('models') is None:
    with st.spinner('updating model list'):
        st.session_state['models'] = numerapi_utils.get_lb_models()

# debug purpose only
# show_session_status_info()

show_content()
project_tools/__init__.py ADDED
File without changes
project_tools/numerapi_utils.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numerapi
2
+ from numerapi import utils
3
+ from project_tools import project_config, project_utils
4
+ from typing import List, Dict
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ napi = numerapi.NumerAPI()
9
+
10
+
11
+ # def get_round
12
+
13
+
14
+ # deprecated
15
+ # def get_model_history(model):
16
+ # res = napi.daily_user_performances(model)
17
+ # res = pd.DataFrame.from_dict(res)
18
+ # res['payoutPending'] = res['payoutPending'].astype(np.float64)
19
+ # res['payoutSettled'] = res['payoutSettled'].astype(np.float64)
20
+ # res['stakeValue'] = res['stakeValue'].astype(np.float64)
21
+ # res['deltaRatio'] = res['payoutPending'] / res['stakeValue']
22
+ # res['realised_pl'] = project_utils.series_reverse_cumsum(res['payoutSettled'])
23
+ # res['floating_pl'] = project_utils.series_reverse_cumsum(res['payoutPending']) - res['realised_pl']
24
+ # res['current_stake'] = res['stakeValue'] - res['floating_pl']
25
+ # rename_dict = {'stakeValue':'floating_stake'}
26
+ # res = res.rename(columns=rename_dict)
27
+ # # res['equity'] = res['stakeValue'] + res['floating_pl']
28
+ # # cols = res.columns.tolist()
29
+ # # res = res[['model'] + cols]
30
+ #
31
+ # res['model'] = model
32
+ # cols = ['model', 'date', 'current_stake', 'floating_stake', 'payoutPending', 'floating_pl', 'realised_pl']
33
+ # res = res[cols]
34
+ # return res
35
+
36
+
37
def get_portfolio_overview(models, onlylatest=True):
    """
    Collect the stake / payout history of several models into one DataFrame.
    :param models: iterable of model names
    :param onlylatest: if True keep only the row labelled 0 of each model's history
                       and sort the result by 'floating_pl' (descending);
                       if False keep every row in download order
    :return: the combined DataFrame with a fresh index, or None when ``models`` is empty
    """
    frames = []
    for model_name in models:
        print(f'extracting information for model {model_name}')
        history = get_model_history_v3(model_name)
        frames.append(history.loc[0:0] if onlylatest else history)

    if len(frames) == 0:
        return None

    combined = pd.concat(frames, axis=0)
    if onlylatest:
        combined = combined.sort_values(by='floating_pl', ascending=False)
    return combined.reset_index(drop=True)
58
+
59
+
60
+
61
+
62
+
63
+
64
def get_competitions(tournament=8):
    """Retrieve the list of rounds of a tournament.

    Args:
        tournament (int, optional): ID of the tournament, defaults to 8.
            It is passed to the API as the ``$tournament`` query variable.

    Returns:
        list of dicts: one dict per round, holding the fields requested by
        the query below:

            * number
            * resolveTime (converted with ``utils.parse_datetime_string``)
            * openTime (converted with ``utils.parse_datetime_string``)
            * resolvedGeneral
            * resolvedStaking
    """
    query = '''
        query($tournament: Int!) {
          rounds(tournament: $tournament) {
            number
            resolveTime
            openTime
            resolvedGeneral
            resolvedStaking
          }
        }
    '''
    arguments = {'tournament': tournament}
    result = napi.raw_query(query, arguments)
    rounds = result['data']['rounds']
    # Convert datetime strings to datetime objects. The original also tried to
    # convert 'prizePoolNmr' / 'prizePoolUsd', which this query never requests.
    for r in rounds:
        utils.replace(r, "openTime", utils.parse_datetime_string)
        utils.replace(r, "resolveTime", utils.parse_datetime_string)
    return rounds
122
+
123
+
124
def daily_submissions_performances(username: str) -> List[Dict]:
    """Fetch the daily submission performance entries of a user.

    Args:
        username (str): sent as the ``$username`` variable of a ``v2UserProfile`` query

    Returns:
        list of dicts: the ``dailySubmissionPerformances`` entries, each holding
        the fields requested by the query below (date, correlation,
        corrPercentile, roundNumber, mmc, mmcPercentile, fnc, fncPercentile,
        correlationWithMetamodel). ``date`` is converted with
        ``utils.parse_datetime_string``. Entries whose ``correlation``, ``fnc``
        and ``mmc`` are all falsy are dropped.
    """
    query = """
        query($username: String!) {
          v2UserProfile(username: $username) {
            dailySubmissionPerformances {
              date
              correlation
              corrPercentile
              roundNumber
              mmc
              mmcPercentile
              fnc
              fncPercentile
              correlationWithMetamodel
            }
          }
        }
    """
    profile = napi.raw_query(query, {'username': username})['data']['v2UserProfile']
    entries = profile['dailySubmissionPerformances']

    # First pass: turn the date strings into python objects.
    for entry in entries:
        utils.replace(entry, "date", utils.parse_datetime_string)

    # Second pass: keep only entries that carry at least one truthy score.
    kept = []
    for entry in entries:
        if entry['correlation'] or entry['fnc'] or entry['mmc']:
            kept.append(entry)
    return kept
177
+
178
+
179
def daily_submissions_performances_V3(modelname: str) -> List[Dict]:
    """Fetch the per-round performance entries of a model via ``v3UserProfile``.

    Args:
        modelname (str): sent as the ``$modelName`` query variable

    Returns:
        list of dicts: the ``roundModelPerformances`` entries with the fields
        requested by the query below, exactly as returned by the API. Entries
        whose ``corr``, ``tc`` and ``mmc`` are all falsy are dropped.
    """
    # Each field is requested once; the original selected roundResolveTime and
    # mmcMultiplier twice.
    query = """
        query($modelName: String!) {
          v3UserProfile(modelName: $modelName) {
            roundModelPerformances{
              roundNumber
              roundResolveTime
              corr
              corrPercentile
              mmc
              mmcMultiplier
              mmcPercentile
              tc
              tcPercentile
              tcMultiplier
              fncV3
              fncV3Percentile
              corrWMetamodel
              payout
              roundResolved
              corrMultiplier
              selectedStakeValue
            }
            stakeValue
            nmrStaked
          }
        }
    """
    arguments = {'modelName': modelname}
    data = napi.raw_query(query, arguments)['data']['v3UserProfile']
    performances = data['roundModelPerformances']
    # NOTE(review): the original looped over the entries converting a "date" key,
    # but this query returns no such field, so that loop changed nothing and was
    # removed. 'roundResolveTime' is deliberately left as returned so callers
    # keep receiving the same value type - confirm whether a datetime is wanted.
    # remove useless items
    performances = [p for p in performances
                    if any([p['corr'], p['tc'], p['mmc']])]
    return performances
219
+
220
+
221
def get_lb_models(limit=20000, offset=0):
    """
    List the usernames returned by the ``v2Leaderboard`` query.
    :param limit: value of the ``$limit`` query variable
    :param offset: value of the ``$offset`` query variable
    :return: list with the 'username' of every leaderboard entry, in API order
    """
    query = """
        query($limit: Int, $offset: Int){
          v2Leaderboard(limit:$limit, offset:$offset){
            username
          }
        }
    """
    leaderboard = napi.raw_query(query, {'limit':limit, 'offset':offset})['data']['v2Leaderboard']
    model_list = []
    for entry in leaderboard:
        model_list.append(entry['username'])
    return model_list
233
+
234
+
235
+
236
def get_round_model_performance(roundNumber: int, model: str):
    """
    Fetch the last daily performance entry of a model for one round.
    :param roundNumber: value of the ``$roundNumber`` query variable
    :param model: value of the ``$username`` query variable
    :return: dict with 'model', 'roundNumber', 'corrMultiplier', 'mmcMultiplier',
             'selectedStakeValue' plus every field of the last element of
             ``roundDailyPerformances``
    """
    query = """
        query($roundNumber: Int!, $username: String!) {
          roundSubmissionPerformance(roundNumber: $roundNumber, username: $username) {
            corrMultiplier
            mmcMultiplier
            roundDailyPerformances{
              correlation
              mmc
              corrPercentile
              mmcPercentile
              payoutPending
            }
            selectedStakeValue
          }
        }
    """
    variables = {'roundNumber': roundNumber,'username': model}
    submission = napi.raw_query(query, variables)['data']['roundSubmissionPerformance']
    # The original author flagged the ordering of this list as an open issue;
    # the last element is taken as-is.
    latest_performance = submission['roundDailyPerformances'][-1]
    res = {
        'model': model,
        'roundNumber': roundNumber,
        'corrMultiplier': submission['corrMultiplier'],
        'mmcMultiplier': submission['mmcMultiplier'],
        'selectedStakeValue': submission['selectedStakeValue'],
    }
    for field, value in latest_performance.items():
        res[field] = value
    return res
265
+
266
+
267
+
268
+
269
def get_user_profile(username: str) -> Dict:
    """Fetch the raw ``v2UserProfile`` response for a user.

    Unlike ``daily_submissions_performances``, nothing is unpacked, converted
    or filtered here: the ``'data'`` member of the API response is returned
    as-is.

    Args:
        username (str): sent as the ``$username`` query variable

    Returns:
        dict: the ``'data'`` part of the response. Given the query below, the
        daily entries are expected under
        ``['v2UserProfile']['dailySubmissionPerformances']``.
    """
    query = """
        query($username: String!) {
          v2UserProfile(username: $username) {
            dailySubmissionPerformances {
              date
              correlation
              corrPercentile
              roundNumber
              mmc
              mmcPercentile
              fnc
              fncPercentile
              correlationWithMetamodel
            }
          }
        }
    """
    arguments = {'username': username}
    data = napi.raw_query(query, arguments)['data']
    return data
322
+
323
+
324
def download_dataset(filename: str, dest_path: str = None,
                     round_num: int = None) -> None:
    """Download a dataset file through the ``dataset`` API query.

    Args:
        filename (str): file to be downloaded
        dest_path (str, optional): complete path where the file should be
            stored, defaults to the same name as the source file
        round_num (int, optional): tournament round you are interested in;
            sent to the API as-is (``None`` when omitted)

    Example:
        >>> download_dataset("some_file.parquet")
    """
    target = filename if dest_path is None else dest_path

    query = """
        query ($filename: String!
               $round: Int) {
            dataset(filename: $filename
                    round: $round)
        }
    """
    variables = {'filename': filename, "round": round_num}

    # The query answers with a URL; the actual transfer is done by numerapi's helper.
    dataset_url = napi.raw_query(query, variables)['data']['dataset']
    utils.download_file(dataset_url, target, show_progress_bars=True)
354
+
355
+
356
+
357
+ # function using V3UserProfile
358
+
359
def model_payout_history(model):
    """
    Download the per-round payout records of a model via ``v3UserProfile``.
    :param model: model name, sent as the ``$model`` query variable
    :return: DataFrame built from ``roundModelPerformances`` with the rows whose
             'payout' is null removed and the index reset
    """
    # A fresh client is created for every call (it shadows the module-level one).
    napi = numerapi.NumerAPI()
    query = """
        query($model: String!) {
          v3UserProfile(modelName: $model) {
            roundModelPerformances{
              payout
              roundNumber
              roundResolved
              roundResolveTime
              corrMultiplier
              mmcMultiplier
              selectedStakeValue
            }
            stakeValue
            nmrStaked
          }
        }
    """
    response = napi.raw_query(query, {'model': model})
    records = response['data']['v3UserProfile']['roundModelPerformances']
    payout_info = pd.DataFrame.from_dict(records)
    has_payout = ~pd.isnull(payout_info['payout'])
    return payout_info[has_payout].reset_index(drop=True)
383
+
384
+
385
def get_model_history_v3(model):
    """
    Build the stake / profit-and-loss history of a model from its payout records.
    :param model: model name
    :return: DataFrame with the columns 'model', 'date', 'current_stake',
             'floating_stake', 'payout', 'floating_pl', 'realised_pl',
             'roundResolved' and 'roundNumber'
    """
    # Work on a copy so the frame returned by model_payout_history is not mutated.
    res = model_payout_history(model).copy()
    res['payout'] = res['payout'].astype(np.float64)
    res['current_stake'] = res['selectedStakeValue'].astype(np.float64)
    res['payout_cumsum'] = project_utils.series_reverse_cumsum(res['payout'])
    res['date'] = pd.to_datetime(res['roundResolveTime']).dt.date

    resolved = res['roundResolved'] == True
    pending = res['roundResolved'] == False

    # Realised P/L: resolved rows keep their own cumulative payout; unresolved
    # rows take the value of the first resolved row.
    res['realised_pl'] = res['payout_cumsum']
    resolved_cumsum = res.loc[resolved, 'payout_cumsum'].values
    # A model without any resolved round has realised nothing yet; the original
    # raised IndexError here.
    latest_realised_pl = resolved_cumsum[0] if len(resolved_cumsum) > 0 else 0.0
    res.loc[pending, 'realised_pl'] = latest_realised_pl

    # Floating P/L: reverse cumulative sum of the payouts of unresolved rounds.
    # The column starts as float so the float assignment below keeps its dtype.
    res['floating_pl'] = 0.0
    pending_payouts = res.loc[pending, 'payout'].values
    res.loc[pending, 'floating_pl'] = pending_payouts[::-1].cumsum()[::-1]

    res['model'] = model
    res['floating_stake'] = res['current_stake'] + res['floating_pl']
    cols = ['model', 'date', 'current_stake', 'floating_stake', 'payout', 'floating_pl', 'realised_pl',
            'roundResolved', 'roundNumber']
    return res[cols]
409
+
410
+
411
+
412
+
413
+
414
+
project_tools/project_config.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.dirname(os.getcwd()))
4
+
5
# strftime patterns used across the project
DATETIME_FORMAT1 = '%Y%m%d%H%M'
DATETIME_FORMAT2 = '%Y/%m/%d %H:%M'
DATETIME_FORMAT3 = '%Y-%m-%d'

SAVE_LOCAL_COPY = True

BENCHMARK_MODELS = ['integration_test', 'integration_test_7']

# Data files all live under FEATURE_PATH
FEATURE_PATH = './feature_data/'
MODEL_ROUND_RESULT_FILE = FEATURE_PATH + 'model_round_result.pkl'
MODEL_DAILY_RESULT_FILE = FEATURE_PATH + 'model_daily_result.pkl'
NUMERATI_FILE = FEATURE_PATH + 'numerati_data.pkl'

NUMERATI_URL = 'https://raw.githubusercontent.com/woobe/numerati/master/data.csv'
17
+
18
+
19
+
20
+
21
+
project_tools/project_utils.py ADDED
@@ -0,0 +1,815 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ import pickle
5
+ import time
6
+ from contextlib import contextmanager
7
+ from importlib import reload
8
+ import re
9
+ from project_tools import project_config, project_utils, numerapi_utils
10
+ import glob
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ from random import randint, random
14
+ import itertools
15
+ import scipy
16
+ from scipy.stats import ks_2samp
17
+ from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, mean_squared_error
18
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler
19
+ from sklearn.pipeline import make_pipeline
20
+ from sklearn import linear_model
21
+ import datetime
22
+ import json
23
+ from collections import OrderedDict
24
+ from os import listdir
25
+ from os.path import isfile, join, isdir
26
+ import glob
27
+ import numerapi
28
+ import itertools
29
+ import io
30
+ import requests
31
+ from pathlib import Path
32
+ from scipy.stats.mstats import gmean
33
+ from typing import List, Dict
34
+
35
+
36
+ napi = numerapi.NumerAPI() #verbosity="info")
37
+
38
+
39
def get_time_string():
    """
    Format the current local time as a compact string.
    :return: the time of the call as 'YYYYmmddHHMM' (12 digits)
    """
    return datetime.datetime.now().strftime('%Y%m%d%H%M')
48
+
49
+
50
def reload_project():
    """
    Re-import the project modules (config, utils, numerapi utils, in that order)
    so that edits made during experimentation take effect without a restart.
    :return: None
    """
    for module in (project_config, project_utils, numerapi_utils):
        reload(module)
58
+
59
@contextmanager
def timer(name):
    """
    Context manager that prints how long the enclosed code took to run.
    :param name: label of the code fragment being timed
    :yield: nothing; the elapsed time is only printed
    """
    t0 = time.time()
    print('[%s] in progress' % name)
    try:
        yield
    finally:
        # Report the elapsed time even when the timed code raises; the
        # exception still propagates to the caller.
        print('[%s] done in %.6f s' % (name, time.time() - t0))
70
+
71
+
72
+
73
def load_data(pickle_file):
    """
    load pickle data from file
    :param pickle_file: path of pickle data
    :return: data stored in pickle file
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    # The with-block closes the handle, which the original left open.
    with open(pickle_file, 'rb') as load_file:
        return pickle.load(load_file)
82
+
83
+
84
def pickle_data(path, data, protocol=-1, timestamp=False, verbose=True):
    """
    Pickle data to specified file
    :param path: full path of file where data will be pickled to
    :param data: data to be pickled
    :param protocol: pickle protocol, -1 indicate to use the latest protocol
    :param timestamp: if true, '_<time string>' is inserted before the file extension
    :param verbose: if true, print information about file creation
    :return: None
    """
    file = path
    if timestamp:
        base_file, ext = os.path.splitext(file)
        file = base_file + '_' + get_time_string() + ext

    if verbose:
        print('creating file %s' % file)

    # The with-block closes the file even if pickling fails part-way.
    with open(file, 'wb') as save_file:
        pickle.dump(data, save_file, protocol=protocol)
105
+
106
+
107
def save_json(path, data, timestamp=False, verbose=True, indent=2):
    """
    Save data to Json format
    :param path: full path of file where data will be written to
    :param data: data to be serialised
    :param timestamp: if true, the timestamp will be saved as part of the file name
    :param verbose: if true, print information about file creation
    :param indent: specify the width of the indent in the resulted Json file
    :return: None
    """
    file = path
    if timestamp:
        base_file, ext = os.path.splitext(file)
        file = base_file + '_' + get_time_string() + ext
    if verbose:
        print('creating file %s' % file)
    # The with-block closes the file even if serialisation fails part-way.
    with open(file, 'w') as outfile:
        json.dump(data, outfile, indent=indent)
128
+
129
+
130
def load_json(json_file):
    """
    load data from Json file
    :param json_file: path of json file
    :return: data stored in json file as python dictionary
    """
    # The with-block closes the file even if parsing fails.
    with open(json_file) as load_file:
        return json.load(load_file)
140
+
141
+
142
def create_folder(path):
    """
    Create a directory, including missing parents; an existing directory is left alone.
    :param path: path of the directory to create
    :return: None
    """
    folder = Path(path)
    folder.mkdir(parents=True, exist_ok=True)
144
+
145
+
146
+
147
def glob_folder_filelist(path, file_type='', recursive=True):
    """
    List the files found under a directory by globbing.
    :param path: the path of the directory; a trailing '/' is appended when missing
    :param file_type: if not '', only files whose extension (without the dot) equals this value are kept
    :param recursive: if True glob with '**' (descends into sub-directories), otherwise with '*'
    :return abs_files: list of the matched file paths, as produced by glob from ``path``
    :return base_files: list with the base name of each matched file, in the same order
    """
    if path[-1] != '/':
        path = path + '/'
    wildcard = '**' if recursive else '*'

    abs_files = []
    base_files = []
    for candidate in glob.glob(path + wildcard, recursive=recursive):
        # Directories are matched by the pattern too; keep regular files only.
        if not os.path.isfile(candidate):
            continue
        if len(file_type) > 0 and os.path.splitext(candidate)[1][1:] != file_type:
            continue
        abs_files.append(candidate)
        base_files.append(os.path.basename(candidate))
    return abs_files, base_files
174
+
175
+
176
def dir_compare(pathl, pathr):
    """
    compare the files located directly inside two directories by their file names
    :param pathl: path of the left directory
    :param pathr: path of the right directory
    :return: two lists of file names, the first one holds the names only found in the left directory,
             the second one holds the names only found in the right directory
    """
    left_names = {name for name in listdir(pathl) if isfile(join(pathl, name))}
    right_names = {name for name in listdir(pathr) if isfile(join(pathr, name))}
    only_left = list(left_names - right_names)
    only_right = list(right_names - left_names)
    return only_left, only_right
180
+
181
+
182
+
183
+
184
def lr_dir_sync(pathl, pathr):
    """
    copy every file that exists directly inside the left directory, but not inside the right directory,
    into the right directory. Files only found in the right directory are left untouched.
    :param pathl: path of the left (source) directory
    :param pathr: path of the right (destination) directory
    :return: None
    """
    # dir_compare lives in this file, no need to reach it through the project_utils module name
    files_lrdiff, _ = dir_compare(pathl, pathr)
    for f in files_lrdiff:
        # os.path.join keeps the paths valid when a directory is given without trailing separator
        src = os.path.join(pathl, f)
        dst = os.path.join(pathr, f)
        print('copying file %s' % src)
        copyfile(src, dst)
191
+
192
+
193
+
194
def copy_file_with_time(src_file, dst_file_name, des_path):
    """
    copy a file to a new name that carries the string returned by get_time_string() between the
    base name and the extension
    :param src_file: path of the file to be copied
    :param dst_file_name: file name for the copy, only its base name and its extension are used
    :param des_path: prefix placed in front of the new file name, it is concatenated as is
    :return: None
    """
    stem, extension = os.path.splitext(os.path.basename(dst_file_name))
    stamp = get_time_string()
    target = '%s%s_%s%s' % (des_path, stem, stamp, extension)
    copyfile(src_file, target)
201
+
202
+
203
+
204
+
205
+
206
def find_filesfromfolder(target_dir, containtext):
    """
    look up the files of a directory whose base name contains a given text
    :param target_dir: the directory handed over to glob_folder_filelist
    :param containtext: the text that has to be part of the base name of a file
    :return: list with the path of every matching file
    """
    found_paths, found_names = glob_folder_filelist(target_dir)
    return [full for full, short in zip(found_paths, found_names) if containtext in short]
214
+
215
+
216
def cp_files_with_prefix(src_path, dst_path, prefix, ext):
    """
    copy the files of a given type from one folder to another, with a prefix added to each file name
    :param src_path: the folder handed over to get_folder_filelist
                     (NOTE(review): that helper is defined outside of this section, confirm its contract)
    :param dst_path: prefix of the destination, it is concatenated with the new file name as is
    :param prefix: the text put in front of the base name of each copied file
    :param ext: the file type passed to get_folder_filelist as file_type
    :return: None
    """
    sources, names = get_folder_filelist(src_path, file_type=ext)
    for source, name in zip(sources, names):
        copyfile(source, dst_path + prefix + name)
223
+
224
+
225
+
226
def mv_files_with_prefix(src_path, dst_path, prefix, ext):
    """
    move the files of a given type from one folder to another, with a prefix added to each file name
    :param src_path: the folder handed over to get_folder_filelist
                     (NOTE(review): that helper is defined outside of this section, confirm its contract)
    :param dst_path: prefix of the destination, it is concatenated with the new file name as is
    :param prefix: the text put in front of the base name of each moved file
    :param ext: the file type passed to get_folder_filelist as file_type
    :return: None
    """
    sources, names = get_folder_filelist(src_path, file_type=ext)
    for source, name in zip(sources, names):
        move(source, dst_path + prefix + name)
233
+
234
+
235
+
236
def empty_folder(path):
    """
    remove the files matched by a folder pattern, sub folders are left in place
    :param path: folder path ending with a path separator (e.g. 'data/'), or a glob pattern that already
                 ends with '*'. The '*' is appended as is when missing, so 'data' without the separator
                 matches every entry whose name starts with 'data'
    :return: None
    """
    if not path:
        # an empty path would expand to '*' and wipe the current working directory
        raise ValueError('empty_folder requires a non-empty path')
    if not path.endswith('*'):
        path = path + '*'
    for f in glob.glob(path):
        # os.remove cannot delete directories, skip them instead of aborting half way through
        if os.path.isdir(f) and not os.path.islink(f):
            continue
        os.remove(f)
242
+
243
+
244
def rescale(n, range1, range2):
    """
    map a number linearly from a source range onto a destination range, the number is clipped to the
    bounds of the source range beforehand
    :param n: the number to be rescaled
    :param range1: the source range, given as (lower bound, upper bound)
    :param range2: the destination range, given as (lower bound, upper bound)
    :return: the clipped number expressed in the destination range
    """
    src_low, src_high = range1[0], range1[1]
    dst_low, dst_high = range2[0], range2[1]
    clipped = n
    if clipped > src_high:
        clipped = src_high
    if clipped < src_low:
        clipped = src_low
    src_span = src_high - src_low
    dst_span = dst_high - dst_low
    return (dst_span * (clipped - src_low) / src_span) + dst_low
252
+
253
+
254
+
255
def rmse(y_true, y_pred):
    """
    evaluate predictions with the RMSE (Root Mean Square Error) metric
    :param y_true: label values
    :param y_pred: prediction values
    :return: the square root of mean_squared_error(y_true, y_pred)
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
263
+
264
+
265
+
266
+
267
def str2date(date_str, dateformat='%Y-%m-%d'):
    """
    parse a string of a known format into a datetime value
    :param date_str: the input string, written in the format given by dateformat
    :param dateformat: the format handed over to strptime to perform the type conversion
    :return: the datetime value that corresponds to the input string
    """
    return datetime.datetime.strptime(date_str, dateformat)
276
+
277
+
278
def isnotebook():
    """
    Tell whether the code is executed from a jupyter notebook (.ipynb) or from a python script (.py)
    :return: True if the shell returned by get_ipython() is a ZMQInteractiveShell (jupyter notebook),
             False for any other shell and when get_ipython is not defined at all
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython only exists inside IPython based sessions
        return False
    return shell_name == 'ZMQInteractiveShell'
293
+
294
+
295
+
296
def list_intersection(left, right):
    """
    compute the elements shared by two lists, both lists are converted into sets for the operation
    :param left: the first input list
    :param right: the second input list
    :return: the elements present in both input lists, as a list without duplicates
    """
    shared = set(left) & set(right)
    return list(shared)
306
+
307
+
308
def list_union(left, right):
    """
    compute the combined elements of two lists, both lists are converted into sets for the operation
    :param left: the first input list
    :param right: the second input list
    :return: the elements present in at least one of the input lists, as a list without duplicates
    """
    combined = set(left) | set(right)
    return list(combined)
318
+
319
+
320
def list_difference(left, right):
    """
    compute the elements of the first list that are missing from the second list, both lists are
    converted into sets for the operation
    :param left: the first input list
    :param right: the second input list
    :return: the elements only present in the first input list, as a list without duplicates
    """
    only_left = set(left) - set(right)
    return list(only_left)
330
+
331
+
332
def is_listelements_identical(left, right):
    """
    check whether two lists have the same length and are made of the same distinct elements,
    the order of the elements does not matter
    :param left: the first input list
    :param right: the second input list
    :return: True if both lists have equal length and equal sets of elements, otherwise False
    """
    # the sets are compared in both directions: a one-sided difference misses elements that only
    # exist in the right list, e.g. left=[1, 1] and right=[1, 2]
    return len(left) == len(right) and set(left) == set(right)
336
+
337
+
338
+
339
+
340
def np_corr(a, b):
    """
    compute the correlation of two numpy arrays by means of the pandas Series corr method
    :param a: the first numpy array input
    :param b: the second numpy array input
    :return: the correlation between the two input arrays
    """
    series_a = pd.Series(a)
    series_b = pd.Series(b)
    return series_a.corr(series_b)
348
+
349
+
350
+
351
def list_sort_values(a, ascending=True):
    """
    sort the values of a list by means of the pandas Series sort_values method
    :param a: the input list
    :param ascending: True to sort in ascending order, False to sort in descending order
    :return: a new list holding the values of the input list in the requested order
    """
    ordered = pd.Series(a).sort_values(ascending=ascending)
    return ordered.tolist()
359
+
360
+
361
def get_rank(data):
    """
    turn the values of a list or an array into percentage ranks
    :param data: the input data in the form of a list or an array
    :return: numpy array holding the percentage rank of each input value
    """
    return pd.Series(data).rank(pct=True).values
369
+
370
+
371
+
372
def plot_feature_corr(df, features, figsize=(10,10), vmin=-1.0):
    """
    draw the pair-wise correlation matrix of the specified features of a dataframe as a heatmap
    :param df: the input dataframe
    :param features: the list of features for which the correlation matrix will be plotted
    :param figsize: the size of the figure created for the plot
    :param vmin: the value handed over to the seaborn heatmap as vmin
    :return: the pair-wise correlation values as pandas dataframe, with missing correlations replaced by 0
    """
    corr_matrix = df[features].corr().fillna(0)
    # the heatmap is drawn onto the figure created here
    plt.subplots(figsize=figsize)
    sns.heatmap(corr_matrix, vmin=vmin, square=True)
    return corr_matrix
385
+
386
+
387
def decision_to_prob(data):
    """
    convert output value of a sklearn classifier (i.e. ridge classifier) decision function into probability.
    The exponentials are normalised over all the input values together, so that the output sums to 1.
    :param data: output value of decision function in the form of a numpy array
    :return: value of probability in the form of a numpy array
    """
    if np.size(data) == 0:
        # nothing to normalise, np.max would fail on an empty input
        return np.exp(data)
    # shifting by the maximum leaves the ratio unchanged, but keeps np.exp from overflowing to inf
    # (inf / inf would turn every probability into nan)
    exp_values = np.exp(np.subtract(data, np.max(data)))
    return exp_values / np.sum(exp_values)
395
+
396
+
397
def np_describe(a):
    """
    summarise the values of a numpy array with the describe method of a pandas Series
    :param a: the input numpy array, it is flattened beforehand
    :return: the overall statistic description as returned by describe
    """
    flat_values = a.flatten()
    return pd.Series(flat_values).describe()
404
+
405
+
406
def ks_2samp_selection(train_df, test_df, pval=0.1):
    """
    keep the features that are statistically similar between a train and a test dataframe, according to
    the scipy ks_2samp function.
    :param train_df: the input train dataframe, each of its columns has to exist in the test dataframe
    :param test_df: the input test dataframe
    :param pval: the p value threshold, features with a p value lower than the threshold are discarded
    :return train_df: the train dataframe reduced to the selected features
    :return test_df: the test dataframe without the discarded features
    """
    feature_names = train_df.columns.tolist()
    p_values = [ks_2samp(train_df[name], test_df[name])[1] for name in feature_names]
    ranked = pd.Series(p_values, index=feature_names).sort_values()
    rejected = list(ranked[ranked < pval].index)
    return train_df.drop(columns=rejected), test_df.drop(columns=rejected)
423
+
424
+
425
+
426
def df_balance_sampling(df, class_feature, minor_class=1, sample_ratio=1):
    """
    balance a dataframe with binary 0/1 classes by down sampling the major class
    :param df: the input dataframe
    :param class_feature: the feature holding the class of each row
    :param minor_class: the value of the minor class, the major class is taken as (1 - minor_class)
    :param sample_ratio: how many rows of the major class are sampled for each row of the minor class
    :return: dataframe holding all the rows of the minor class and the sampled rows of the major class,
             in shuffled order and with a fresh index
    """
    minor_df = df[df[class_feature] == minor_class]
    major_df = df[df[class_feature] == (1 - minor_class)].sample(sample_ratio * len(minor_df))

    # DataFrame.append is deprecated since pandas 1.4 and removed in pandas 2.0
    res_df = pd.concat([minor_df, major_df])
    res_df = res_df.sample(len(res_df)).reset_index(drop=True)
    return res_df
440
+
441
+
442
def prob2acc(label, probs, p=0.5):
    """
    compute the accuracy score of probability predictions for a given threshold, the probabilities are
    turned into discrete binary predictions along the way
    :param label: labels used to evaluate the accuracy score
    :param probs: probability predictions for which the accuracy score will be calculated
    :param p: the threshold, probabilities greater than or equal to it become 1, the others become 0
    :return acc: the computed accuracy score
    :return preds: predictions in discrete binary value
    """
    preds = (probs >= p).astype(np.uint8)
    return accuracy_score(label, preds), preds
455
+
456
+
457
+
458
def np_pearson(t,p):
    """
    compute the pearson correlation of two arrays
    :param t: the first input array
    :param p: the second input array
    :return: the sum of the products of the centered values, divided by the product of the
             square roots of their sums of squares
    """
    centered_t = t - t.mean()
    centered_p = p - p.mean()
    covariance = np.sum(centered_t*centered_p)
    scale = np.sqrt(np.sum(centered_t**2)) * np.sqrt(np.sum(centered_p**2))
    return covariance/scale
465
+
466
+
467
def df_get_features_with_str(df, ptrn):
    """
    collect the feature names of a dataframe that match a regular expression pattern
    :param df: the input dataframe of which the feature names are analysed
    :param ptrn: the regular expression pattern, it may match anywhere inside the feature name
    :return: list of the feature names in which the pattern is found
    """
    return [name for name in df.columns.tolist() if re.search(ptrn, name) is not None]
475
+
476
+
477
def df_fillna_with_other(df, src_feature, dst_feature):
    """
    fill the NA values of a specified feature in a dataframe with values of another feature from the same row.
    The input dataframe is modified in place.
    :param df: the input dataframe
    :param src_feature: the feature of which values will be used as replacement
    :param dst_feature: the specified feature of which NA values will be filled
    :return: the input dataframe, with the NA values of "dst_feature" being filled by values from "src_feature"
    """
    # isna also copes with non-numeric columns, on which np.isnan raises a TypeError
    missing = df[dst_feature].isna().values
    # work on a copy, so that the column is only changed through the assignment below
    filled = df[dst_feature].values.copy()
    filled[missing] = df[src_feature].values[missing]
    df[dst_feature] = filled
    return df
491
+
492
+
493
+
494
def plot_prediction_prob(y_pred_prob):
    """
    show the distribution of probability prediction values as a histogram, and print their overall
    statistic description
    :param y_pred_prob: the probability prediction values to be plotted
    :return: None, the histogram is shown while the function is running
    """
    prob_series = pd.Series(data=y_pred_prob, name='prediction probability')
    prob_series.plot(kind='hist', figsize=(15, 5), bins=50)
    plt.show()
    print(prob_series.describe())
505
+
506
+
507
+
508
+
509
+
510
def df_traintest_split(df, split_var, seed=None, train_ratio=0.75):
    """
    perform train test split on a specified feature on a given dataframe with specified train ratio.
    Each unique value of the specified feature will only be present in either the resulted train or the test dataframe
    :param df: the input dataframe to be split
    :param split_var: the feature to be used as unique value to perform the split
    :param seed: the random state used to sample the unique values that go into the train dataframe
    :param train_ratio: the ratio of unique values of the feature to be assigned to the resulted train dataframe.
    :return train_df: the resulted train dataframe after the split, with a fresh index
    :return test_df: the resulted test dataframe after the split, with a fresh index
    """
    unique_values = pd.Series(df[split_var].unique())
    train_length = int(len(unique_values) * train_ratio)
    train_values = unique_values.sample(train_length, random_state=seed)
    # a boolean mask does not depend on the index labels; the former mix of label based and
    # position based lookups was only correct for a default index
    in_train = df[split_var].isin(train_values)
    train_df = df[in_train].copy().reset_index(drop=True)
    test_df = df[~in_train].copy().reset_index(drop=True)
    return train_df, test_df
528
+
529
+
530
+
531
+ # https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
532
def reduce_mem_usage(df, verbose=True, exceiptions=()):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage. Only signed integer and float columns are converted, columns of any
        other type (text, bool, unsigned integer, datetime, ...) are left as they are.
        A dataframe input is modified in place.
    :param df: the input dataframe, or a numpy array which gets wrapped into a dataframe
    :param verbose: if true, print the name of each column being processed
    :param exceiptions: collection of column names that must not be converted
    :return: the converted data, as numpy array if the input was a numpy array, otherwise as dataframe
    """
    np_input = isinstance(df, np.ndarray)
    if np_input:
        df = pd.DataFrame(data=df)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col_id, col in enumerate(df.columns):
        if verbose: print('doing %d: %s' % (col_id, col))
        col_type = df[col].dtype
        # anything that is not a plain signed integer or float column used to end up in the float
        # branch, which turned e.g. uint8 and bool columns into (larger) float32 columns
        if col in exceiptions or not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue
        try:
            c_min = df[col].min()
            c_max = df[col].max()
            if col_type.kind == 'i':
                # pick the first integer type whose range strictly contains the column values
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            elif col_type.itemsize > 4:
                # float32 is the smallest float type used, float16 is deliberately not considered
                limits = np.finfo(np.float32)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(np.float32)
        except (TypeError, ValueError) as err:
            # best effort: a column that cannot be converted keeps its type
            if verbose: print('skipping %s: %s' % (col, err))
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    if np_input:
        return df.values
    else:
        return df
581
+
582
+
583
+
584
def get_xgb_featimp(model):
    """
    collect the feature importance scores of a xgboost model for each importance type
    :param model: either an object that provides get_booster() (presumably the xgboost sklearn wrapper),
                  or the booster itself
    :return: dataframe indexed by feature name with one column per importance type, a feature that has
             no score for an importance type gets NaN
    """
    imp_type = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
    try:
        bst = model.get_booster()
    except AttributeError:
        # without a get_booster method, the input is taken as the booster itself; any other
        # failure of get_booster is no longer hidden
        bst = model
    feature_names = bst.feature_names
    imp_dict = {}
    for impt in imp_type:
        scores = bst.get_score(importance_type=impt)
        imp_dict[impt] = [scores.get(feature, np.nan) for feature in feature_names]
    return pd.DataFrame(index=feature_names, data=imp_dict)
602
+
603
+
604
def get_df_rankavg(df):
    """
    replace the values of each column of a dataframe by their percentage rank, and average the
    ranks of each row
    :param df: the input dataframe
    :return: dataframe holding the percentage ranks of each column plus the column 'rankavg' with
             the row-wise mean of the ranks, sorted by 'rankavg' in descending order
    """
    col_names = df.columns.tolist()
    pct_ranks = {name: df[name].rank(pct=True).tolist() for name in col_names}
    ranked = pd.DataFrame(index=df.index, columns=col_names, data=pct_ranks)
    ranked['rankavg'] = ranked.mean(axis=1)
    return ranked.sort_values(by='rankavg', ascending=False)
613
+
614
+
615
def get_list_gmean(lists):
    """
    compute the element-wise geometric mean over several lists of equal length
    :param lists: list of the input lists, each of them becomes one column of the value matrix
    :return: array holding the geometric mean of the values found at each position of the input lists
    """
    stacked = np.zeros((len(lists[0]), len(lists)))
    for col_idx, values in enumerate(lists):
        stacked[:, col_idx] = values
    return gmean(stacked, axis=1)
621
+
622
+
623
+
624
def generate_nwise_combination(items, n=2):
    """
    list all the combinations of n elements that can be picked from the input items
    :param items: the input items
    :param n: the number of elements per combination
    :return: list of tuples, each tuple being one combination
    """
    combos = itertools.combinations(items, n)
    return list(combos)
626
+
627
+
628
def pairwise_feature_generation(df, feature_list, operator='addition', verbose=True):
    """
    generate a new feature for each pair of the specified features by combining both features
    with an arithmetic operator
    :param df: the input dataframe
    :param feature_list: names of the features to be paired up, the names have to be strings
    :param operator: 'addition', 'multiplication' or 'division'; no feature is generated for any other value
    :param verbose: if true, print information about each pair being processed
    :return: dataframe holding the generated features, named after both features and the operator
    """
    generated = pd.DataFrame()
    for first, second in itertools.combinations(feature_list, 2):
        if verbose:
            print('generating %s of %s and %s' % (operator, first, second))
        if operator == 'addition':
            generated[first + '_add_' + second] = df[first] + df[second]
        elif operator == 'multiplication':
            generated[first + '_mulp_' + second] = df[first] * df[second]
        elif operator == 'division':
            generated[first + '_div_' + second] = df[first] / df[second]
    return generated
644
+
645
+
646
def try_divide(x, y, val=0.0):
    """
    divide two numbers, and fall back to a default value when the divisor is zero
    :param x: the number to be used as dividend
    :param y: the number to be used as divisor
    :param val: the default output value
    :return: the result of the division, or the default value of val if the divisor is zero
    """
    if y == 0.0:
        return val
    return float(x) / y
657
+
658
+
659
def series_reverse_cumsum(a):
    """
    compute the cumulative sum of a pandas Series starting from its last value
    :param a: the input pandas Series, NA values are counted as 0
    :return: numpy array in which each position holds the sum of the values from that position to the end
    """
    reversed_values = a.fillna(0).values[::-1]
    return reversed_values.cumsum()[::-1]
661
+
662
+
663
def get_array_sharpe(values):
    """
    compute the sharpe ratio of a set of values
    :param values: the input values, an object that provides the methods mean() and std()
    :return: the mean of the values divided by their standard deviation
    """
    average = values.mean()
    deviation = values.std()
    return average/deviation
665
+
666
+
667
+ #### NumerDash specific functions ###
668
+
669
def calculate_rounddailysharpe_dashboard(df, lastround, earliest_round, score='corr'):
    """
    build the table of the sharpe ratio of a score per round for each model
    :param df: dataframe with the columns 'model', 'group', 'roundNumber' and the column named by score
    :param lastround: the latest round number to be included
    :param earliest_round: the earliest round number to be included
    :param score: name of the score column on which the sharpe ratio is computed
    :return: dataframe with the columns 'model', 'group', 'sos' (sharpe ratio of the per round sharpe ratios),
             'avg_sharpe' (mean of the per round sharpe ratios), followed by one column per round with the
             latest round first. Rows are sorted by 'sos' in descending order, and rows that hold identical
             values are reduced to the first of them
    """
    target_names = {
        'corr': 'corr_sharpe',
        'corr_pct': 'corr_pct_sharpe',
        'mmc': 'mmc_sharpe',
        'mmc_pct': 'mmc_pct_sharpe',
        'corrmmc': 'corrmmc_sharpe',
        'corr2mmc': 'corr2mmc_sharpe',
        'cmavg_pct': 'cmavgpct_sharpe',
        'c2mavg_pct': 'c2mavgpct_sharpe',
    }
    # the target is only the name of an intermediate column; a score that is not listed above used to
    # fail on an unbound variable, it now gets a generic name
    target = target_names.get(score, '%s_sharpe' % score)

    mean_feat = 'avg_sharpe'
    sos_feat = 'sos'
    df = df[(df['roundNumber'] >= earliest_round) & (df['roundNumber'] <= lastround)]
    res = df.groupby(['model', 'roundNumber', 'group'])[score].apply(get_array_sharpe).reset_index(drop=False)
    res = res.rename(columns={score: target}).sort_values('roundNumber', ascending=False)
    res = res.pivot(index=['model', 'group'], columns='roundNumber', values=target)
    res.columns.name = ''
    # the pivot orders the rounds ascending, the dashboard shows the latest round first
    cols = list(res.columns[::-1])
    res = res[cols]
    res[mean_feat] = res[cols].mean(axis=1)
    res[sos_feat] = res[cols].apply(get_array_sharpe, axis=1)
    res = res.drop_duplicates(keep='first').sort_values(by=sos_feat, ascending=False)
    res.reset_index(drop=False, inplace=True)
    return res[['model', 'group', sos_feat, mean_feat]+cols]
702
+
703
+
704
+
705
def groupby_agg_execution(agg_recipies, df, verbose=True):
    """
    compute groupby statistics of a dataframe according to a list of recipes
    :param agg_recipies: list of (groupby columns, features, aggregations) tuples. An aggregation is either
                         something accepted by the pandas agg method, which is also used as its name, or a
                         dictionary whose first key is the name and whose value is the aggregation
    :param df: the input dataframe
    :param verbose: if true, print the name of each statistic being generated
    :return: dictionary keyed by the groupby columns joined with '_', each value is a dataframe holding
             the groupby columns and one column per statistic, named '<key>_<feature>_<name>', or
             '<key>_count' for an aggregation named 'count'
    """
    result_dfs = {}
    for groupby_cols, features, aggs in agg_recipies:
        grouped = df.groupby(groupby_cols)
        key = '_'.join(groupby_cols)
        if key not in result_dfs:
            result_dfs[key] = pd.DataFrame()
        for feature in features:
            for agg in aggs:
                if isinstance(agg, dict):
                    agg_name = list(agg)[0]
                    agg_func = agg[agg_name]
                else:
                    agg_name = agg_func = agg
                if agg_name == 'count':
                    stat_name = '{}_{}'.format(key, agg_name)
                else:
                    stat_name = '{}_{}_{}'.format(key, feature, agg_name)
                if verbose:
                    print(f'generating statistic {stat_name}')
                stat_df = grouped[feature].agg(agg_func).reset_index(drop=False)
                stat_df = stat_df.rename(columns={feature: stat_name})
                if len(result_dfs[key]) == 0:
                    # the first statistic also brings in the groupby columns
                    result_dfs[key] = stat_df
                else:
                    result_dfs[key][stat_name] = stat_df[stat_name]
    return result_dfs
733
+
734
+
735
def get_latest_round_id():
    """
    look up the number of the latest numerai round
    :return: the 'number' of the first competition returned by numerapi_utils.get_competitions()
             (presumably the most recent one), as integer; 0 if the lookup fails
    """
    try:
        all_competitions = numerapi_utils.get_competitions()
        latest_comp_id = all_competitions[0]['number']
    except Exception as e:
        # best effort: fall back to round 0 instead of failing. Catching Exception, rather than
        # everything, still lets KeyboardInterrupt and SystemExit through
        print('calling numerai API unsuccessful: %s' % e)
        latest_comp_id = 0
    return int(latest_comp_id)


# evaluated once, when the module is loaded
latest_round = get_latest_round_id()
749
+
750
+
751
+
752
+
753
def update_numerati_data(url=project_config.NUMERATI_URL, save_path=project_config.FEATURE_PATH, timeout=60):
    """
    download the numerati csv data, save it as pickle file, and return it
    :param url: the address of the csv data
    :param save_path: the folder in which the file 'numerati_data.pkl' is written
    :param timeout: the number of seconds to wait for the server before giving up
    :return: the downloaded data as pandas dataframe
    """
    # without a timeout, requests may wait forever on a server that does not respond
    response = requests.get(url, timeout=timeout)
    # stop on HTTP errors, otherwise the error page would be parsed and saved over the previous data
    response.raise_for_status()
    data = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    save_file = os.path.join(save_path, 'numerati_data.pkl')
    pickle_data(save_file, data)
    return data
759
+
760
+
761
+
762
+
763
def get_model_group(model_name):
    """
    look up the group of a model from the model lists of project_config
    :param model_name: the name of the model
    :return: the name of the first group whose model list contains the model, in the order checked
             below; 'other' if the model is in none of the lists
    """
    if model_name in project_config.MODEL_NAMES+project_config.NEW_MODEL_NAMES:
        return 'yx'
    if model_name in project_config.TOP_LB:
        return 'top_corr'
    if model_name in project_config.IAAI_MODELS:
        return 'iaai'
    if model_name in project_config.ARBITRAGE_MODELS:
        return 'arbitrage'
    if model_name in project_config.MCV_MODELS:
        return 'mcv'
    if model_name in project_config.BENCHMARK_MODELS:
        return 'benchmark'
    if model_name in project_config.TP3M:
        return 'top_3m'
    if model_name in project_config.TP1Y:
        return 'top_1y'
    return 'other'
784
+
785
+
786
def get_dashboard_data_status():
    """
    report the time of the dashboard model result file and of the numerati file
    :return: two strings, the ctime (os.path.getctime, taken as UTC) of project_config.DASHBOARD_MODEL_RESULT_FILE
             and of project_config.NUMERATI_FILE, formatted with project_config.DATETIME_FORMAT2.
             A time that cannot be determined is reported as 'NA'
    """
    def _ctime_string(config_attr):
        # the config lookups happen inside the try, so that any failure ends up as 'NA'
        try:
            stamp = datetime.datetime.utcfromtimestamp(os.path.getctime(getattr(project_config, config_attr)))
            return stamp.strftime(project_config.DATETIME_FORMAT2)
        except Exception as e:
            print(e)
            return 'NA'

    dashboard_data_tstr = _ctime_string('DASHBOARD_MODEL_RESULT_FILE')
    nmtd_tstr = _ctime_string('NUMERATI_FILE')
    return dashboard_data_tstr, nmtd_tstr
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+
810
+
811
+
812
+
813
+
814
+
815
+
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ matplotlib==3.2.2
2
+ numerapi==2.9.0
3
+ numpy==1.20.0
4
+ pandas==1.3.2
5
+ Pillow==9.1.0
6
+ plotly==4.14.3
7
+ requests==2.25.1
8
+ scikit_learn==1.0.2
9
+ scipy==1.6.0
10
+ seaborn==0.11.1