|
|
|
|
|
|
|
|
|
import torch |
|
import torch.nn as nn |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler |
|
from sklearn.model_selection import train_test_split |
|
from torch.utils.data import DataLoader, TensorDataset |
|
import plotly.express as px |
|
import streamlit as st |
|
from es_class_nn import SimplePlusNN2 |
|
from prediction_analyzer import PredictionAnalyzer |
|
# Model version tag; it is interpolated into the state-dict file name
# further down in this script.
version_name = 'CON_44'

# Page header: two equal-weight columns, with the logo rendered in the
# right-hand one. Calling .image() on the column object is equivalent to
# entering it with a `with` block.
c1, c2 = st.columns([6,6])
c2.image(
    'img/logo_vidad.png',
    width=300,
    caption='https://www.continental.edu.pe/',
)
|
|
|
# Page title followed by the upload instructions (user-facing text is Spanish).
# NOTE(review): the first instruction mentions a PKL file while the uploader
# below only accepts XLSX -- confirm which wording is intended.
st.title("Predicción de Abandono o Permanencia")
for instruction in (
    "Cargue el archivo PKL para visualizar el análisis de su contenido.",
    'Cargue el archivo con datos nuevos aqui. Este archivo deberá seguir las pautas del diccionario de categorías y deberá estar en formato XLSX',
    'En caso no conozca el diccionario descarguelo aquí.',
):
    st.write(instruction)

# Download link for the variable dictionary the instructions refer to.
st.link_button(
    label='Diccionario',
    url='https://huggingface.co/spaces/gestiodinamica/continental_predictivo/resolve/main/auxiliares/diccionario_variables_pkl_train.txt?download=true',
)

# Stays None until the user provides a file; the prediction pipeline below
# only runs once a file is present.
uploaded_file = st.file_uploader(label="Cargar archivo: ", type='xlsx')

# Auxiliary category list shipped with the app.
# NOTE(review): not referenced anywhere else in the visible script -- confirm
# whether it is still needed.
df_categ = pd.read_excel('auxiliares/lista_categorias_rev.xlsx')
|
|
|
if uploaded_file is not None:
    # ------------------------------------------------------------------
    # 1. Load the upload.
    # ------------------------------------------------------------------
    # Rows containing any missing value are discarded. The DNI column becomes
    # the row index and is removed from the feature columns.
    df_2 = pd.read_excel(uploaded_file).dropna(axis=0).set_index('DNI')
    st.write('Dimensiones de archivo ingresado: ', df_2.shape)

    # ------------------------------------------------------------------
    # 2. Split the columns by type.
    # ------------------------------------------------------------------
    # Object-dtype columns with fewer than 35 distinct values are one-hot
    # encoded; object columns with 35 or more distinct values are dropped;
    # every non-object column is passed through unchanged.
    cat_list = [
        col for col in df_2.columns
        if df_2[col].dtype == 'object' and len(df_2[col].unique()) < 35
    ]
    num_list = [col for col in df_2.columns if df_2[col].dtype != 'object']

    # get_dummies keeps the DNI index of its input.
    df_b = pd.get_dummies(df_2[cat_list])

    # NOTE(review): the scaler is fitted on the uploaded rows themselves. For
    # 0/1 dummies that is an identity transform, except that a dummy column
    # which is constant in this upload is mapped to 0 -- confirm this matches
    # the preprocessing used when the model was trained.
    scaler = MinMaxScaler()
    df_sc = pd.DataFrame(
        scaler.fit_transform(df_b),
        columns=df_b.columns,
        index=df_b.index,
    )
    df_n = df_2[num_list]

    # Both frames derive their index from df_2, so this is a sanity check.
    if (df_sc.index == df_n.index).sum() == len(df_2):
        st.write('Indices Consistentes Verificados.')
    df_r = pd.concat([df_sc, df_n], axis=1)
    st.write('Dimensiones del archivo procesado: ', df_r.shape)

    # ------------------------------------------------------------------
    # 3. Align the columns with the list stored for this model version.
    # ------------------------------------------------------------------
    version_name = 'CON_44'
    list_df_modelo = pd.read_excel(
        f'auxiliares/lista_categorias_{version_name}_v0.xlsx'
    )
    lista_categorias = list_df_modelo[0].to_list()

    # Columns whose name starts with 'ESTADO' are left out of the model input
    # and echoed to stdout.
    cats_final = []
    for c in lista_categorias:
        if c.startswith('ESTADO'):
            print(c)
        else:
            cats_final.append(c)

    # reindex keeps the listed columns in list order, fills the ones absent
    # from the upload with 0 and discards columns that are not in the list.
    # The zero filler is sized from the upload itself; the previous
    # hard-coded range(140) failed for any file without exactly 140 rows.
    df_ready = df_r.reindex(columns=cats_final, fill_value=0)
    st.write('Dimensiones del archivo para predicción: ', df_ready.shape)

    # ------------------------------------------------------------------
    # 4. Run the network.
    # ------------------------------------------------------------------
    X_train_tensor = torch.tensor(df_ready.values, dtype=torch.float32)
    st.write('Dimensiones del tensor: ', X_train_tensor.shape)
    input_size = X_train_tensor.shape[1]
    num_classes = 2

    model = SimplePlusNN2(input_size, num_classes)
    data_path = 'models/'
    dict_name = f'edusights_20240702_state_dict_{version_name}.pth'
    st.write(dict_name)
    # The model and the input tensor both stay on the CPU, so map the stored
    # weights there too; this also works for checkpoints saved from a GPU.
    model.load_state_dict(
        torch.load(data_path+dict_name, map_location='cpu')
    )
    model.eval()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(X_train_tensor)
    outputs_show = outputs.detach().numpy().flatten()
    st.write('Salida: ', outputs_show.shape)

    # Scores above 0.51 become 1.0 and scores below 0.49 become 0.0; anything
    # in between is treated as undecided and excluded from the counts.
    outputs_show[outputs_show > 0.51] = 1.0
    outputs_show[outputs_show < 0.49] = 0.0
    decided = (outputs_show == 0.0) | (outputs_show == 1.0)
    filtered_arr = outputs_show[decided]
    unique, counts = np.unique(filtered_arr, return_counts=True)
    st.write(unique, counts)
    pred_df = pd.DataFrame(filtered_arr)
    st.write('Dimensión de predicciones totales: ', pred_df.shape)

    df_ready_2 = df_ready.copy()
    if len(outputs_show) == len(df_ready_2):
        # One score per row: attach by position. Assigning pred_df directly
        # would align its 0..n-1 index against the DNI index and leave the
        # column NaN. Undecided rows are stored as NaN.
        df_ready_2['Predicción'] = np.where(decided, outputs_show, np.nan)
    else:
        # NOTE(review): the flattened output does not have one value per
        # row (the model presumably emits several values per sample), so the
        # row mapping cannot be derived here. The original assignment is kept;
        # it aligns on index labels and will generally not line up with DNI.
        df_ready_2['Predicción'] = pred_df
    st.write(df_ready_2.head())

    # ------------------------------------------------------------------
    # 5. Per-category analysis and downloads.
    # ------------------------------------------------------------------
    analyzer = PredictionAnalyzer(model, df_ready)
    results_df = analyzer.predictions_loop()
    st.write(results_df)

    # NOTE(review): to_csv() returns a str here, so the encoding argument does
    # not affect the bytes that are downloaded -- confirm whether ISO-8859-1
    # output is actually required.
    csv_out = df_ready_2.to_csv(encoding='iso-8859-1')
    st.download_button(
        label="Descargar Predicciones Totales",
        data=csv_out,
        file_name='predicciones_carga.csv',
        mime='text/csv',
    )

    # NOTE(review): same file name as the download above; consider a distinct
    # name so the two files do not overwrite each other on disk.
    csv_out = results_df.to_csv(encoding='iso-8859-1')
    st.download_button(
        label="Descargar Predicciones por Categoría",
        data=csv_out,
        file_name='predicciones_carga.csv',
        mime='text/csv',
    )
|
|
|
# Page footer: two equal-weight columns, with the logo rendered in the
# left-hand one. Calling .image() on the column object is equivalent to
# entering it with a `with` block.
c3, c4 = st.columns([6,6])
c3.image(
    'img/gdmklogo.png',
    width=100,
    caption='Powered by GestioDinámica 2024',
)
|
|