File size: 6,199 Bytes
f784bc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# Tratamiento de datos
# ==============================================================================
import re
import numpy as np
import pandas as pd
from astral.sun import sun
from astral import LocationInfo
from skforecast.datasets import fetch_dataset
from feature_engine.datetime import DatetimeFeatures
from feature_engine.creation import CyclicalFeatures
from feature_engine.timeseries.forecasting import WindowFeatures
from sklearn.preprocessing import PolynomialFeatures
import sys
import os
##########################################################################################
# current_dir = os.getcwd()
# ROOT_PATH = os.path.dirname(current_dir)
# sys.path.insert(1, ROOT_PATH)
# import root
# datos = pd.read_pickle(root.DIR_DATA_STAGE + 'train.pkl')
# Variables basadas en el calendario
def calendar_features(datos):
    """Derive calendar features from the index of ``datos``.

    A feature_engine ``DatetimeFeatures`` transformer is fitted on ``datos``
    with ``variables='index'`` and asked for the month, week, day of week and
    hour.

    Parameters
    ----------
    datos : pandas.DataFrame
        Input frame; the features are extracted from its index
        (presumably a DatetimeIndex - confirm with the caller).

    Returns
    -------
    pandas.DataFrame
        Only the columns ``month``, ``week``, ``day_of_week`` and ``hour``,
        in that order.
    """
    wanted = ['month', 'week', 'day_of_week', 'hour']
    extractor = DatetimeFeatures(
        variables='index',
        features_to_extract=wanted,
        drop_original=True,
    )
    transformed = extractor.fit_transform(datos)
    # Discard whatever other columns ``datos`` carried; keep the new ones only.
    return transformed[wanted]
# Variables basadas en la luz solar
def solar_features(datos):
    """Build sunrise/sunset based features for every entry of ``datos.index``.

    Sunrise and sunset are computed with ``astral`` for a fixed observer,
    rounded to the nearest hour and reduced to their hour component.

    Parameters
    ----------
    datos : pandas.DataFrame
        Frame whose index supplies the dates. The index must expose ``.hour``
        (presumably a DatetimeIndex - confirm with the caller).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``datos`` with the columns:

        * ``sunrise_hour``   - hour of the rounded sunrise time
        * ``sunset_hour``    - hour of the rounded sunset time
        * ``daylight_hours`` - ``sunset_hour - sunrise_hour``
        * ``is_daylight``    - 1 when ``sunrise_hour <= index hour <
          sunset_hour``, otherwise 0
    """
    # NOTE(review): the label reads 'Taillin' / 'Estonia', yet the time zone is
    # Europe/Riga and the coordinates appear to correspond to Riga. Only
    # ``observer`` and ``timezone`` are used below, so the label does not
    # influence the result - confirm which location is actually intended.
    location = LocationInfo(
        name='Taillin',
        region='Estonia',
        timezone='Europe/Riga',
        latitude=56.946285,
        longitude=24.105078,
    )
    observer = location.observer
    tz_name = location.timezone

    # A single ``sun`` call returns both events; the previous version invoked
    # it twice per timestamp (once for sunrise, once for sunset).
    sun_events = [
        sun(observer, date=date, tzinfo=tz_name)
        for date in datos.index
    ]

    sunrise_hour = pd.Series(
        [events['sunrise'] for events in sun_events], index=datos.index
    ).dt.round("h").dt.hour
    sunset_hour = pd.Series(
        [events['sunset'] for events in sun_events], index=datos.index
    ).dt.round("h").dt.hour

    variables_solares = pd.DataFrame({
        'sunrise_hour': sunrise_hour,
        'sunset_hour': sunset_hour,
    })
    variables_solares['daylight_hours'] = (
        variables_solares['sunset_hour'] - variables_solares['sunrise_hour']
    )
    # Flag the timestamps whose hour lies in [sunrise_hour, sunset_hour).
    variables_solares["is_daylight"] = np.where(
        (datos.index.hour >= variables_solares["sunrise_hour"])
        & (datos.index.hour < variables_solares["sunset_hour"]),
        1,
        0,
    )
    return variables_solares
# Union of exogenous variables
def union_exog_features(variables_calendario, variables_solares):
    """Concatenate calendar and solar features column-wise.

    Parameters
    ----------
    variables_calendario : pandas.DataFrame
        Calendar features.
    variables_solares : pandas.DataFrame
        Solar features; must have exactly the same index as
        ``variables_calendario``.

    Returns
    -------
    pandas.DataFrame
        ``variables_calendario`` columns followed by ``variables_solares``
        columns.

    Raises
    ------
    AssertionError
        If the two indexes differ in length or in any position.
    """
    idx_calendar = variables_calendario.index
    idx_solar = variables_solares.index
    # Explicit check instead of an ``assert`` statement: asserts are removed
    # under ``python -O``, which would let ``pd.concat`` silently outer-join
    # misaligned frames. AssertionError is kept for backward compatibility.
    if len(idx_calendar) != len(idx_solar) or not (idx_calendar == idx_solar).all():
        raise AssertionError(
            "variables_calendario and variables_solares must share the same index"
        )
    return pd.concat([variables_calendario, variables_solares], axis=1)
def ciclic_features(variables_exogenas):
    """Add cyclical encodings of the calendar and solar hour columns.

    A feature_engine ``CyclicalFeatures`` transformer is fitted on the input
    for ``month``, ``week``, ``day_of_week``, ``hour``, ``sunrise_hour`` and
    ``sunset_hour``. The original columns are kept
    (``drop_original=False``).

    Parameters
    ----------
    variables_exogenas : pandas.DataFrame
        Frame containing the six columns listed above.

    Returns
    -------
    pandas.DataFrame
        The transformer output: the input columns plus the encoded ones.
    """
    # Maximum value handed to the encoder for each variable; the key order
    # also defines the order in which the variables are encoded.
    # NOTE(review): presumably the encoder scales by 2*pi/max_value, in which
    # case 0 and the maximum share one angle and ISO week 53 exceeds 52 -
    # confirm these maxima are intended.
    periods = {
        "month": 12,
        "week": 52,
        "day_of_week": 6,
        "hour": 23,
        "sunrise_hour": 23,
        "sunset_hour": 23,
    }
    encoder = CyclicalFeatures(
        variables=list(periods),
        max_values=periods,
        drop_original=False,
    )
    return encoder.fit_transform(variables_exogenas)
def pol_features(variables_exogenas):
    """Append pairwise interaction terms between exogenous variables.

    A scikit-learn ``PolynomialFeatures`` transformer (degree 2, interactions
    only, no bias) is fitted on the cyclical columns plus ``daylight_hours``
    and ``is_daylight``. The input columns it echoes back are dropped, the
    remaining columns are prefixed with ``poly_`` and any space in their
    names is replaced by ``__``.

    Parameters
    ----------
    variables_exogenas : pandas.DataFrame
        Frame that already holds the ``*_sin`` / ``*_cos`` columns,
        ``daylight_hours`` and ``is_daylight``.

    Returns
    -------
    pandas.DataFrame
        ``variables_exogenas`` with the interaction columns appended.
    """
    base_columns = [
        'month_sin', 'month_cos',
        'week_sin', 'week_cos',
        'day_of_week_sin', 'day_of_week_cos',
        'hour_sin', 'hour_cos',
        'sunrise_hour_sin', 'sunrise_hour_cos',
        'sunset_hour_sin', 'sunset_hour_cos',
        'daylight_hours',
        'is_daylight',
    ]

    interaction_builder = PolynomialFeatures(
        degree=2,
        interaction_only=True,
        include_bias=False,
    )
    # Ask scikit-learn for a DataFrame so column names and index are kept.
    interaction_builder.set_output(transform="pandas")

    expanded = interaction_builder.fit_transform(variables_exogenas[base_columns])
    # The first-degree columns already exist in the input; keep products only.
    interactions = expanded.drop(columns=base_columns)
    interactions.columns = [
        f"poly_{name}".replace(" ", "__") for name in interactions.columns
    ]

    # Sanity check: the transformer must not have altered the index.
    assert all(variables_exogenas.index == interactions.index)
    return pd.concat([variables_exogenas, interactions], axis=1)
def select_exog_features(variables_exogenas):
    """List the exogenous columns that are included in the model.

    A column is selected when its name matches the regular expression
    ``_sin$|_cos$``, i.e. it ends with ``_sin`` or ``_cos``.

    Parameters
    ----------
    variables_exogenas : pandas.DataFrame
        Frame holding all candidate exogenous columns.

    Returns
    -------
    list
        Selected column names, in the order they appear in the frame.
    """
    matching = variables_exogenas.filter(regex='_sin$|_cos$')
    return matching.columns.tolist()
def merge_df(datos, variables_exogenas, exog_features):
    """Attach the selected exogenous columns to ``datos`` by index.

    Parameters
    ----------
    datos : pandas.DataFrame
        Left frame; every one of its rows is kept.
    variables_exogenas : pandas.DataFrame
        Frame containing the exogenous variables.
    exog_features : list
        Names of the columns of ``variables_exogenas`` to attach.

    Returns
    -------
    pandas.DataFrame
        ``datos`` with the selected columns added. Because the join is a
        left join on the index, rows of ``datos`` without a match in
        ``variables_exogenas`` get missing values in the added columns.
    """
    selected = variables_exogenas[exog_features]
    return pd.merge(
        datos,
        selected,
        how='left',
        left_index=True,
        right_index=True,
    )
def create_exog(datos):
    """Run the full exogenous-feature pipeline on ``datos``.

    Steps, in order: calendar features, solar features, their union,
    cyclical encoding, polynomial interactions, selection of the columns to
    keep, and a merge of those columns back onto ``datos``.

    Parameters
    ----------
    datos : pandas.DataFrame
        Input frame; its index drives every generated feature.

    Returns
    -------
    pandas.DataFrame
        ``datos`` with the selected exogenous columns attached.
    """
    # Independent feature families derived from the index.
    calendar_df = calendar_features(datos)
    solar_df = solar_features(datos)

    # Combine, then enrich with cyclical encodings and interaction terms.
    exog_df = union_exog_features(calendar_df, solar_df)
    exog_df = ciclic_features(exog_df)
    exog_df = pol_features(exog_df)

    # Keep only the selected columns and attach them to the input frame.
    selected = select_exog_features(exog_df)
    return merge_df(datos, exog_df, selected)
|