zaborshicov committed on
Commit b1480dd · verified · 1 Parent(s): ab33070

Create utils.py

Files changed (1):
  1. utils.py +148 -0
utils.py ADDED
@@ -0,0 +1,148 @@
+ import numpy as np
+ import pandas as pd
+
+
+ # Fixed seed for reproducibility.
+ SEED = 8642
+ np.random.seed(SEED)
+
+ # Bounding box around Moscow (x = latitude, y = longitude) and the 8x8 grid used for the spatial features.
+ config = {'min_xval': 55.55, 'max_xval': 55.95, 'min_yval': 37.3, 'max_yval': 37.9, 'x_ngroups': 8, 'y_ngroups': 8}
+
+
+ def split_on_intervals(min_val, max_val, n):
+     """Split [min_val, max_val] into n equal parts; returns the n + 1 boundary points."""
+     step = (max_val - min_val) / n
+     intervals = [min_val + step * x for x in range(n + 1)]
+     return intervals
+
+
+ def create_groups(x_intervals, y_intervals):
+     """Create a dict with one zero-initialised counter per grid cell.
+
+     The boundaries are extended with -inf/+inf sentinels so that points
+     outside the bounding box get dedicated edge cells.
+     """
+     groups = {}
+     x_intervals = np.concatenate([[-np.inf], x_intervals, [np.inf]])
+     y_intervals = np.concatenate([[-np.inf], y_intervals, [np.inf]])
+     for x_i in range(len(x_intervals) - 1):
+         for y_i in range(len(y_intervals) - 1):
+             groups[f'x : {x_intervals[x_i]} - {x_intervals[x_i+1]} | y : {y_intervals[y_i]} - {y_intervals[y_i+1]}'] = 0
+     return groups
+
+
+ # Counts how many points fall into each grid cell (region).
+ def sort_on_groups(x_vals, y_vals, x_intervals, y_intervals, groups, only_vals=False):
+     # Mirror the -inf/+inf sentinels added in create_groups so the keys match
+     # and points outside the bounding box are counted in the edge cells.
+     x_intervals = np.concatenate([[-np.inf], x_intervals, [np.inf]])
+     y_intervals = np.concatenate([[-np.inf], y_intervals, [np.inf]])
+     for x, y in zip(x_vals, y_vals):
+         for x_i in range(len(x_intervals) - 1):
+             for y_i in range(len(y_intervals) - 1):
+                 if (x_intervals[x_i] <= x < x_intervals[x_i+1]) and (y_intervals[y_i] <= y < y_intervals[y_i+1]):
+                     groups[f'x : {x_intervals[x_i]} - {x_intervals[x_i+1]} | y : {y_intervals[y_i]} - {y_intervals[y_i+1]}'] += 1
+
+     if only_vals:
+         return list(groups.values())
+
+     return groups
+
+
+ def create_dataset(config, df):
+     """Build per-row grid features: for every row of df, count its points in each grid cell."""
+     x_intervals = split_on_intervals(config['min_xval'], config['max_xval'], config['x_ngroups'])
+     y_intervals = split_on_intervals(config['min_yval'], config['max_yval'], config['y_ngroups'])
+     groups = create_groups(x_intervals, y_intervals)
+
+     groups_values = []
+     for i in range(len(df)):
+         g = df.iloc[i]
+         points = np.array([[float(x['lat']), float(x['lon'])] for x in g['points']])
+         group_values = sort_on_groups(points[:, 0], points[:, 1], x_intervals, y_intervals, groups.copy(), only_vals=True)
+         groups_values.append(group_values)
+
+     # Turn the per-row counts into one column per grid cell.
+     groups_values = np.array(groups_values)
+     for i, key in enumerate(groups):
+         groups[key] = groups_values[:, i]
+
+     return groups
+
+
+ def feature_engineering(df: pd.DataFrame, n_groups: int = 8, parse_json: bool = True, dists: bool = True,
+                         age_feats: bool = True, normalize: bool = True, drop_zero_cols: bool = False,
+                         process_points: bool = True) -> pd.DataFrame:
+
+     df = df.reset_index(drop=True)
+
+     # Flatten the nested 'targetAudience' JSON into regular columns.
+     if parse_json:
+         df_norm = pd.concat([df, pd.json_normalize(df['targetAudience'])], axis=1)
+         df_norm = df_norm.drop(columns=['targetAudience'])
+     else:
+         df_norm = df
+
+     if process_points:
+         df_norm['points_count'] = df_norm['points'].apply(len)
+         if dists:
+             # Mean Euclidean distance (in degrees) from a row's points to three Moscow landmarks.
+             landmarks = {'distance_center': (55.7522, 37.6156),        # city centre
+                          'distance_patriki': (55.763868, 37.592168),   # Patriarshiye Prudy
+                          'distance_luzhniki': (55.717934, 37.551932)}  # Luzhniki
+             for col, (lat_0, lon_0) in landmarks.items():
+                 df_norm[col] = df_norm['points'].apply(
+                     lambda points: sum(((float(p['lat']) - lat_0) ** 2 + (float(p['lon']) - lon_0) ** 2) ** 0.5
+                                        for p in points)) / df_norm['points_count']
+
+         # Per-cell point counts on the n_groups x n_groups grid (see create_dataset above).
+         grid_config = {'min_xval': 55.55, 'max_xval': 55.95, 'min_yval': 37.3, 'max_yval': 37.9,
+                        'x_ngroups': n_groups, 'y_ngroups': n_groups}
+         dataset = pd.DataFrame(create_dataset(grid_config, df_norm))
+         if drop_zero_cols:
+             zero_columns = dataset.sum()[dataset.sum() == 0].index.tolist()
+             dataset = dataset.drop(columns=zero_columns)
+     else:
+         dataset = pd.DataFrame()
+
+     # Drop identifier and raw-geometry columns that should not be fed to the model.
+     df_new = df_norm.drop(columns=['hash', 'points', 'name'], errors='ignore')
+
+     if age_feats:
+         # Square roots of the target-audience age span and mean age.
+         df_new['age_span'] = df_new.apply(lambda row: (row['ageTo'] - row['ageFrom']) ** 0.5, axis=1)
+         df_new['age_mean'] = df_new.apply(lambda row: ((row['ageTo'] + row['ageFrom']) / 2) ** 0.5, axis=1)
+
+     # One-hot encode gender and income, dropping the 'ac' income column when present.
+     gender_ohe = pd.get_dummies(df_new['gender'])
+     income_ohe = pd.get_dummies(df_new['income'])
+     if 'ac' in income_ohe.columns:
+         income_ohe = income_ohe.drop(columns=['ac'])
+     else:
+         print('no ac in income')
+
+     df_full = pd.concat([dataset, df_new, gender_ohe, income_ohe], axis=1).drop(columns=['gender', 'income'])
+     df_full = df_full.fillna(0)
+
+     if normalize:
+         # Z-score normalisation; constant columns produce NaN (std == 0) and are zero-filled.
+         df_scaled = (df_full - df_full.mean()) / df_full.std()
+         df_scaled = df_scaled.fillna(0)
+         return df_scaled
+     else:
+         return df_full
+
+
+ def fit_to_base_cols(df_scaled: pd.DataFrame, base_cols) -> pd.DataFrame:
+     """Add any training-time columns missing from df_scaled, filled with zeros."""
+     missing_cols = list(set(base_cols.tolist()) - set(df_scaled.columns.tolist()))
+     for col in missing_cols:
+         df_scaled[col] = 0
+     return df_scaled
+
+
+ def prep_test_df(test_df: pd.DataFrame, base_cols) -> pd.DataFrame:
+     """Run the same feature pipeline on test data and align its columns with the training set."""
+     sc_df = feature_engineering(test_df, parse_json=True, normalize=False, drop_zero_cols=False)
+     sc_df = fit_to_base_cols(sc_df, base_cols)
+     return sc_df
+
+
+ def ensemble_inference(models, X_test):
+     """Average the predictions of the given fitted models."""
+     y_pred = np.zeros(len(X_test))
+     for model in models:
+         y_pred += model.predict(X_test)
+     y_pred /= len(models)
+     return y_pred
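
A quick sanity check of the grid helpers on toy coordinates (the two points and the 2x2 grid below are made up for illustration; only functions from utils.py are used):

    from utils import split_on_intervals, create_groups, sort_on_groups

    # 2x2 grid over the unit square.
    x_bounds = split_on_intervals(0.0, 1.0, 2)   # [0.0, 0.5, 1.0]
    y_bounds = split_on_intervals(0.0, 1.0, 2)
    groups = create_groups(x_bounds, y_bounds)   # 16 cells: the 2x2 inner grid plus the -inf/+inf edge cells

    # One point inside the lower-left cell, one outside the box (counted in an edge cell).
    counts = sort_on_groups([0.2, 1.5], [0.3, 0.4], x_bounds, y_bounds, groups.copy(), only_vals=True)
    assert sum(counts) == 2  # every point lands in exactly one cell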
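And a minimal end-to-end sketch of how the pieces appear to fit together. The dataframe fields (targetAudience, points, ageFrom/ageTo, gender, income) are the ones the code above expects; the file names, the target column 'value', and the KNeighborsRegressor ensemble are assumptions for illustration only, not part of this commit:

    import pandas as pd
    from sklearn.model_selection import KFold
    from sklearn.neighbors import KNeighborsRegressor
    from utils import feature_engineering, prep_test_df, ensemble_inference

    train_df = pd.read_json('train.json')      # hypothetical input file
    y = train_df.pop('value')                  # hypothetical target column
    X = feature_engineering(train_df, normalize=False)

    # Fit one model per fold; ensemble_inference averages their predictions.
    models = []
    for tr_idx, _ in KFold(n_splits=5, shuffle=True, random_state=8642).split(X):
        model = KNeighborsRegressor(n_neighbors=10)
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        models.append(model)

    test_df = pd.read_json('test.json')        # hypothetical input file
    X_test = prep_test_df(test_df, base_cols=X.columns)[X.columns]  # align column order with training
    y_pred = ensemble_inference(models, X_test)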