|
import streamlit as st |
|
from utilities.template_helpers import upload_data |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
|
|
def _fill_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with nulls filled (mean for numeric, mode otherwise)."""
    numeric_names = set(df.select_dtypes('number').columns)
    replacements = {}
    for name in df.columns:
        column = df[name]
        if name in numeric_names:
            mean = column.mean()
            # An all-null column has a NaN mean; there is nothing useful to fill with.
            if pd.notna(mean):
                replacements[name] = mean
        else:
            # mode() returns a Series (several values on ties, empty when the
            # column is entirely null), so pick the first entry as the scalar.
            modes = column.mode()
            if not modes.empty:
                replacements[name] = modes.iloc[0]
    if not replacements:
        return df.copy()
    return df.fillna(value=replacements)


def _scale_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* whose numeric columns are standardized."""
    numeric_names = df.select_dtypes('number').columns
    # Nothing to fit on when there are no rows or no numeric columns.
    if len(df) == 0 or len(numeric_names) == 0:
        return df
    # StandardScaler standardizes each feature independently, so a single fit
    # over all numeric columns matches fitting them one by one.
    values = StandardScaler().fit_transform(df[numeric_names])
    scaled = df.copy()
    for position, name in enumerate(numeric_names):
        scaled[name] = values[:, position]
    return scaled


def _encode_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with object columns replaced by integer codes.

    Codes start at 1 and follow the order of first appearance. Missing values
    stay missing instead of being given a code of their own.
    """
    encoded = df.copy()
    for name in df.select_dtypes('object').columns:
        codes, _ = pd.factorize(df[name])
        labels = pd.Series(codes + 1, index=df.index, dtype='Int64')
        # factorize marks missing entries with -1; keep them as <NA>.
        encoded[name] = labels.mask(codes < 0)
    return encoded


def render() -> None:
    """Render the preprocessing page.

    Left column: upload widget plus a preview of the data and a per-column
    summary (dtype and null count). Middle column: optional drop-null,
    fill-null and scaling steps. Right column: column selection, optional
    encoding of object columns, and a CSV download of the result.

    Every step produces a new DataFrame, so the object returned by
    ``upload_data`` is never modified.
    """
    st.title("PREPROCESSING")

    col1, col2, col3 = st.columns([1, 1, 1])

    with col1.container():
        # NOTE(review): upload_data presumably returns the uploaded DataFrame,
        # or None / an empty frame when nothing was uploaded - confirm in
        # utilities.template_helpers.
        df = upload_data()
        if df is None:
            return
        if df.shape == (0, 0):
            return

        # dtypes are stringified so the summary table holds plain text.
        info = pd.DataFrame({
            'dtypes': df.dtypes.astype(str),
            'null': df.isna().sum(),
        })

        tab1, tab2 = st.tabs(['Dataframe', 'Info'])
        with tab1:
            st.dataframe(df, use_container_width=True, height=300)
        with tab2:
            st.dataframe(info, use_container_width=True, height=300)

    with col2.container():
        st.write('\n\n')
        st.markdown('#### Drop Null Values')
        st.write('Drop any row containing null values')
        drop_null = st.checkbox('Drop')
        if drop_null:
            df = df.dropna()

        st.write("\n\n")
        st.markdown('#### Fill Null Values')
        st.write(
            "Replace null values with mean of the column for numerical variables,\n"
            "and mode for categorical variables"
        )
        fill_null = st.checkbox('Fill')
        if fill_null:
            df = _fill_missing(df)

        st.write('\n\n')
        st.markdown("#### Scaling")
        st.write("Standardize numerical features by removing the mean and scaling to unit variance.")
        scale = st.checkbox('Scale')
        if scale:
            df = _scale_numeric(df)

    with col3.container():
        st.write("\n\n")
        st.markdown("#### Choose columns")
        cols = st.multiselect(
            'Select columns to use',
            options=list(df.columns),
            default=list(df.columns),
        )
        df = df[cols]

        st.write("\n\n")
        # The heading text is kept as-is; the step encodes object
        # (categorical) columns as integers.
        st.markdown("#### Encode Numerical values")
        enc = st.checkbox('Encode')
        if enc:
            df = _encode_categoricals(df)

        st.write('\n\n')
        st.markdown("#### Download Preprocessed data")
        st.download_button(
            "Download Results",
            df.to_csv(index=False),
            "preprocessed.csv",
            "text/csv",
            key="download-csv",
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|