| | import streamlit as st |
| | from utilities.template_helpers import upload_data |
| | import pandas as pd |
| | import numpy as np |
| | from sklearn.preprocessing import StandardScaler |
| |
|
| |
|
def _summarise(df):
    """Return a per-column summary frame with dtype names and null counts."""
    return pd.DataFrame({
        # dtype objects are rendered as plain strings for display.
        'dtypes': df.dtypes.astype(str),
        'null': df.isna().sum(),
    })


def _fill_missing(df):
    """Return a copy of *df* with nulls replaced column by column.

    Numeric columns are filled with their mean; every other column is filled
    with its most frequent value. Columns that hold no usable value (all
    null) are left untouched. The input frame is not modified.
    """
    filled = df.copy()
    for name in filled.columns:
        series = filled[name]
        if not series.isna().any():
            continue
        is_numeric = (pd.api.types.is_numeric_dtype(series)
                      and not pd.api.types.is_bool_dtype(series))
        if is_numeric:
            value = series.mean()
        else:
            # mode() returns a Series (possibly empty, possibly with ties);
            # take the first entry as the scalar fill value.
            modes = series.mode()
            if modes.empty:
                continue
            value = modes.iloc[0]
        if pd.isna(value):
            continue
        # fillna returns a new Series; it must be assigned back to take effect.
        filled[name] = series.fillna(value)
    return filled


def _scale_numeric(df):
    """Return a copy of *df* with numeric columns standardized.

    Each numeric column is independently shifted to zero mean and scaled to
    unit variance with ``StandardScaler``. A frame with no rows or no numeric
    columns is returned unchanged, because the scaler rejects empty input.
    """
    numeric_columns = df.select_dtypes('number').columns
    if df.empty or len(numeric_columns) == 0:
        return df
    scaled = df.copy()
    # StandardScaler treats every feature independently, so one fit over all
    # numeric columns matches fitting column by column.
    values = df[numeric_columns].to_numpy(dtype=float, na_value=np.nan)
    scaled[numeric_columns] = StandardScaler().fit_transform(values)
    return scaled


def _encode_object_columns(df):
    """Return a copy of *df* with object columns replaced by integer codes.

    Within each object-dtype column the distinct values are numbered
    1, 2, 3, ... in order of first appearance. The input is not modified.
    """
    encoded = df.copy()
    for name in encoded.columns[encoded.dtypes == 'object']:
        column = encoded[name]
        uniques = list(column.unique())
        if not uniques:
            continue
        encoded[name] = column.replace(uniques,
                                       list(range(1, 1 + len(uniques))))
    return encoded


def render():
    """Render the preprocessing page.

    Column 1 uploads a dataset and previews it together with a dtype/null
    summary. Column 2 offers optional null dropping, null filling and
    standard scaling. Column 3 lets the user choose columns, optionally
    encode object columns as integers, and download the result as CSV.
    Returns early (rendering nothing further) when no data is available.
    """
    st.title("PREPROCESSING")

    col1, col2, col3 = st.columns([1, 1, 1])

    with col1.container():
        df = upload_data()
        if df is None:
            return
        if df.shape == (0, 0):
            return

        info = _summarise(df)

        tab1, tab2 = st.tabs(['Dataframe', 'Info'])
        with tab1:
            st.dataframe(df, use_container_width=True, height=300)
        with tab2:
            st.dataframe(info, use_container_width=True, height=300)

    with col2.container():
        st.write('\n\n')
        st.markdown('#### Drop Null Values')
        st.write('Drop any row containing null values')
        drop_null = st.checkbox('Drop')
        if drop_null:
            # Rebind instead of dropping in place so the frame returned by
            # upload_data() is never mutated.
            df = df.dropna()

        st.write("\n\n")
        st.markdown('#### Fill Null Values')
        st.write("""Replace null values with mean of the column for numerical variables,
        and mode for categorical variables""")
        fill_null = st.checkbox('Fill')
        if fill_null:
            df = _fill_missing(df)

        st.write('\n\n')
        st.markdown("#### Scaling")
        st.write("Standardize numerical features by removing the mean and scaling to unit variance.")
        scale = st.checkbox('Scale')
        if scale:
            df = _scale_numeric(df)

    with col3.container():
        st.write("\n\n")
        st.markdown("#### Choose columns")
        cols = st.multiselect('Select columns to use',
                              options=list(df.columns),
                              default=list(df.columns))

        df = df[cols]

        st.write("\n\n")
        st.markdown("#### Encode Numerical values")
        enc = st.checkbox('Encode')
        if enc:
            # The helper works on a copy, so no assignment is made through a
            # column-sliced frame.
            df = _encode_object_columns(df)

        st.write('\n\n')
        st.markdown("#### Download Preprocessed data")
        st.download_button("Download Results",
                           df.to_csv(index=False),
                           "preprocessed.csv",
                           "text/csv",
                           key="download-csv")
| | |
| |
|
| |
|
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |