Create utils.py
utils.py
ADDED
import numpy as np
import pandas as pd

from sklearn.model_selection import ShuffleSplit, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


SEED = 8642
np.random.seed(SEED)

# Lat/lon bounding box (roughly Moscow) and the number of grid cells along each axis.
config = {'min_xval': 55.55, 'max_xval': 55.95, 'min_yval': 37.3, 'max_yval': 37.9, 'x_ngroups': 8, 'y_ngroups': 8}

def split_on_intervals(min_val, max_val, n):
    # Split [min_val, max_val] into n equal intervals and return the n+1 boundary values.
    step = (max_val - min_val) / n
    intervals = [min_val + (step * x) for x in range(n + 1)]
    return intervals


def create_groups(x_intervals, y_intervals):
    # Build a dict with one zero-initialised counter per grid cell, including the
    # open-ended cells beyond the bounding box (-inf / +inf on both axes).
    groups = {}
    x_intervals = np.concatenate([[-np.inf], x_intervals, [np.inf]])
    y_intervals = np.concatenate([[-np.inf], y_intervals, [np.inf]])
    for x_i in range(len(x_intervals) - 1):
        for y_i in range(len(y_intervals) - 1):
            groups[f'x : {x_intervals[x_i]} - {x_intervals[x_i+1]} | y : {y_intervals[y_i]} - {y_intervals[y_i+1]}'] = 0
    return groups

# Sorts points into grid cells (regions) and increments the matching counters.
def sort_on_groups(x_vals, y_vals, x_intervals, y_intervals, groups, only_vals=False):
    for x, y in zip(x_vals, y_vals):
        for x_i in range(len(x_intervals) - 1):
            for y_i in range(len(y_intervals) - 1):
                if ((x_intervals[x_i] <= x < x_intervals[x_i+1]) and (y_intervals[y_i] <= y < y_intervals[y_i+1])):
                    groups[f'x : {x_intervals[x_i]} - {x_intervals[x_i+1]} | y : {y_intervals[y_i]} - {y_intervals[y_i+1]}'] += 1

    if only_vals:
        return list(groups.values())

    return groups

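# Usage sketch (illustrative, not part of the original pipeline): how the three helpers
# above bin points into the lat/lon grid described by `config`. The two sample coordinates
# are made up; the second one lies outside the bounding box and is never counted.
def _example_grid_binning():
    x_intervals = split_on_intervals(config['min_xval'], config['max_xval'], config['x_ngroups'])
    y_intervals = split_on_intervals(config['min_yval'], config['max_yval'], config['y_ngroups'])
    groups = create_groups(x_intervals, y_intervals)
    counts = sort_on_groups([55.76, 56.10], [37.62, 37.40], x_intervals, y_intervals, groups, only_vals=True)
    return counts  # one count per grid cell; here the counts sum to 1
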
def create_dataset(config, df):
    # Count, for every row of df, how many of its points fall into each grid cell.
    x_intervals = split_on_intervals(config['min_xval'], config['max_xval'], config['x_ngroups'])
    y_intervals = split_on_intervals(config['min_yval'], config['max_yval'], config['y_ngroups'])
    groups = create_groups(x_intervals, y_intervals)

    groups_values = []
    for i in range(len(df)):
        g = df.iloc[i]
        points = np.array([[float(x['lat']), float(x['lon'])] for x in g['points']])
        group_values = sort_on_groups(points[:, 0], points[:, 1], x_intervals, y_intervals, groups.copy(), only_vals=True)
        groups_values.append(group_values)

    groups_values = np.array(groups_values)
    for i, key in enumerate(list(groups.keys())):
        groups[key] = groups_values[:, i]

    return groups

def feature_engineering(df: pd.DataFrame, n_groups: int = 8, parse_json: bool = True, dists: bool = True,
                        age_feats: bool = True, normalize: bool = True, drop_zero_cols: bool = False,
                        process_points: bool = True) -> pd.DataFrame:

    df.reset_index(drop=True, inplace=True)

    if parse_json:
        # Expand the nested 'targetAudience' JSON column into flat columns.
        df_norm = pd.concat([df, pd.json_normalize(df['targetAudience'])], axis=1)
        df_norm = df_norm.drop(columns=['targetAudience'])
    else:
        df_norm = df

    if process_points:
        df_norm['points_count'] = df_norm['points'].apply(len)
        if dists:
            # Mean Euclidean distance (in degrees) from each row's points to three reference
            # locations: the city centre, Patriarshiye Ponds and Luzhniki.
            lat_center = 55.7522
            lon_center = 37.6156
            df_norm['distance_center'] = df_norm['points'].apply(lambda points: [((float(point['lat']) - lat_center) ** 2 + (float(point['lon']) - lon_center) ** 2) ** 0.5 for point in points]).apply(sum) / df_norm['points_count']
            lat_patr = 55.763868
            lon_patr = 37.592168
            df_norm['distance_patriki'] = df_norm['points'].apply(lambda points: [((float(point['lat']) - lat_patr) ** 2 + (float(point['lon']) - lon_patr) ** 2) ** 0.5 for point in points]).apply(sum) / df_norm['points_count']
            lat_luzh = 55.717934
            lon_luzh = 37.551932
            df_norm['distance_luzhniki'] = df_norm['points'].apply(lambda points: [((float(point['lat']) - lat_luzh) ** 2 + (float(point['lon']) - lon_luzh) ** 2) ** 0.5 for point in points]).apply(sum) / df_norm['points_count']

        config = {'min_xval': 55.55, 'max_xval': 55.95, 'min_yval': 37.3, 'max_yval': 37.9, 'x_ngroups': n_groups, 'y_ngroups': n_groups}
        dataset = pd.DataFrame(create_dataset(config, df_norm))
        if drop_zero_cols:
            zero_columns = dataset.sum()[dataset.sum() == 0].index.tolist()
            dataset = dataset.drop(columns=zero_columns)
    else:
        dataset = pd.DataFrame()

    df_new = df_norm.drop(columns=['hash', 'points', 'name'], errors='ignore')

    if age_feats:
        # Note: '** 1/2' binds as '(x ** 1) / 2', so these are halved values, not square roots.
        df_new['age_span'] = df_new.apply(lambda row: (row['ageTo'] - row['ageFrom']) ** 1/2, axis=1)
        df_new['age_mean'] = df_new.apply(lambda row: ((row['ageTo'] + row['ageFrom']) / 2) ** 1/2, axis=1)

    gender_ohe = pd.get_dummies(df_new['gender'])
    try:
        income_ohe = pd.get_dummies(df_new['income']).drop(columns=['ac'])
    except KeyError:
        print('no ac in income')
        income_ohe = pd.get_dummies(df_new['income'])

    df_full = pd.concat([dataset, df_new, gender_ohe, income_ohe], axis=1).drop(columns=['gender', 'income'])
    df_full = df_full.fillna(0)

    if normalize:
        # Z-score each column; zero-variance columns become NaN and are reset to 0.
        df_scaled = (df_full - df_full.mean()) / df_full.std()
        df_scaled = df_scaled.fillna(0)
        return df_scaled
    else:
        return df_full

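# Usage sketch (illustrative, not part of the original pipeline): the minimal input layout
# that feature_engineering appears to expect when parse_json=True. The column names and
# sample values below are assumptions inferred from the code above ('targetAudience' holding
# gender / ageFrom / ageTo / income, 'points' holding a list of {'lat', 'lon'} dicts); the
# real schema may differ.
def _example_feature_engineering():
    toy = pd.DataFrame({
        'hash': ['abc'],
        'name': ['sample campaign'],
        'points': [[{'lat': '55.76', 'lon': '37.62'}, {'lat': '55.70', 'lon': '37.55'}]],
        'targetAudience': [{'gender': 'all', 'ageFrom': 18, 'ageTo': 45, 'income': 'bc'}],
    })
    return feature_engineering(toy, normalize=False)
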
def fit_to_base_cols(df_scaled: pd.DataFrame, base_cols) -> pd.DataFrame:
    # Add any training-time columns missing from df_scaled, filled with zeros.
    missing_cols = list(set(base_cols.tolist()) - set(df_scaled.columns.tolist()))
    for col in missing_cols:
        df_scaled[col] = 0
    return df_scaled

def prep_test_df(test_df: pd.DataFrame, base_cols) -> pd.DataFrame:
    # Run the same (unnormalized) feature engineering on the test frame, then align it
    # to the training columns.
    sc_df = feature_engineering(test_df, parse_json=True, normalize=False, drop_zero_cols=False)
    sc_df = fit_to_base_cols(sc_df, base_cols)
    return sc_df

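# Usage sketch (illustrative, not part of the original pipeline): aligning a test frame to
# the feature columns produced for a training frame, so a fitted model sees the same
# features in the same order. `train_df` and `test_df` are placeholders shaped like the toy
# frame in _example_feature_engineering above.
def _example_align_columns(train_df, test_df):
    train_feats = feature_engineering(train_df, normalize=False)
    test_feats = prep_test_df(test_df, train_feats.columns)
    return train_feats, test_feats[train_feats.columns]
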
def ensemble_inference(models, X_test):
    # Average the predictions of every model in the ensemble.
    y_pred = np.zeros(len(X_test))
    for model in models:
        y_pred += model.predict(X_test)
    y_pred /= len(models)
    return y_pred

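# Usage sketch (illustrative, not part of the original pipeline): a small bagged ensemble of
# the KNeighborsRegressor imported above, averaged with ensemble_inference. `X` and `y` are
# placeholders for an already feature-engineered matrix and its target.
def _example_ensemble(X, y, n_models=5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
    X_train, y_train = np.asarray(X_train), np.asarray(y_train)
    models = []
    for i in range(n_models):
        # Resample the training set with replacement so each model sees a slightly different view.
        idx = np.random.RandomState(SEED + i).choice(len(X_train), size=len(X_train), replace=True)
        models.append(KNeighborsRegressor(n_neighbors=5).fit(X_train[idx], y_train[idx]))
    y_pred = ensemble_inference(models, X_test)
    return mean_absolute_error(y_test, y_pred)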