# Spaces: dummydj2633 / WEB_App (Sleeping)
# File size: 11,330 Bytes
# 0e99845

# _____________ Import Python Libraries _________________ #

import streamlit as st
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ________________ Page Configuration Section _____________  #

# Page-level settings (title and icon); this is the first Streamlit call in the script.
st.set_page_config(page_title="Data Ocean", page_icon="🔥")

# _________________ Web Page Info Section _____________________ #

# Banner shown above the uploader: a multi-colored title followed by a tagline.
st.title(
    ":red[Data] "
    ":blue[Analytic] "
    ":orange[Portal & Machine Learning]"
)
st.header(":rainbow[Explore Data With Ease]")

# __________________ File Upload Section _________________ #

# The uploader is restricted to CSV / XLSX; the extension selects the pandas reader.
file = st.file_uploader('Drop Your CSV, Excel', type=['csv', 'xlsx'])

if file is not None:
    # One outer guard: any failure not handled locally is reported on the page
    # through st.error instead of surfacing as a raw traceback.
    try:
        # Compare the extension case-insensitively so e.g. "DATA.CSV" is accepted.
        file_name = file.name.lower()
        if file_name.endswith('.csv'):
            data = pd.read_csv(file)
        elif file_name.endswith('.xlsx'):
            data = pd.read_excel(file)
        else:
            # Fail with a clear message rather than leaving `data` undefined.
            raise ValueError(f"Unsupported file type: {file.name}")

        st.dataframe(data)
        st.success("File Successfully Uploaded", icon='🎉')

        # ________________ Basic Info Summary Section ______________  #

        st.subheader(':rainbow[Basic Information of The Dataset]', divider='violet')
        tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(
            ['Summary', 'Top & Bottom Rows', 'Data Types', 'Columns', 'Missing Values', 'Duplicates Value']
        )

        row_count = data.shape[0]

        with tab1:
            st.write(f'There are {data.shape[0]} Rows and {data.shape[1]} Columns in The Dataset')
            st.subheader(':blue[Statistical Summary]')
            st.dataframe(data.describe())

        with tab2:
            # A 1..row_count slider is only meaningful with at least two rows:
            # an empty dataset would give min > max, and a single row offers no
            # choice. In those cases show whatever rows exist without a slider.
            st.subheader(':gray[Top Rows]')
            if row_count > 1:
                top_rows = st.slider('Number of Rows to Fetch', 1, row_count, key='topslider')
            else:
                top_rows = row_count
            st.dataframe(data.head(top_rows))

            st.subheader(':green[Bottom Rows]')
            if row_count > 1:
                bottom_rows = st.slider('Number of Rows to Fetch', 1, row_count, key='bottomslider')
            else:
                bottom_rows = row_count
            st.dataframe(data.tail(bottom_rows))

        with tab3:
            st.subheader(':orange[Data Types]')
            st.write(data.dtypes.tolist())

        with tab4:
            st.subheader(':green[Columns]')
            st.write(data.columns.tolist())

        with tab5:
            st.subheader(':red[Missing Values]')
            missing_values = data.isnull().sum()
            st.dataframe(missing_values)
            if missing_values.sum() > 0:
                remove_tab, fill_tab = st.tabs(['Remove Missing Values', 'Fill Missing Values'])

                with remove_tab:
                    if st.checkbox("Remove Rows with Missing Values"):
                        # dropna(inplace=True) returns None, so rebinding its result
                        # would destroy `data`; keep the returned copy instead.
                        data = data.dropna()
                        st.success('Rows with missing values removed!', icon="🎉")

                with fill_tab:
                    replace_nulls = st.selectbox('Replace Missing Values With:', ['None', 'Mean', 'Median', 'Mode'])

                    if replace_nulls != 'None':
                        # Only numeric columns are filled; other columns are left untouched.
                        for col in data.select_dtypes(include=[np.number]).columns:
                            if replace_nulls == 'Mean':
                                fill_value = data[col].mean()
                            elif replace_nulls == 'Median':
                                fill_value = data[col].median()
                            else:  # 'Mode'
                                # mode() is empty for an all-NaN column.
                                modes = data[col].mode()
                                fill_value = modes.iloc[0] if not modes.empty else np.nan

                            # Assign back instead of chained `inplace=True`, which
                            # may act on a temporary and not update `data`.
                            if pd.notna(fill_value):
                                data[col] = data[col].fillna(fill_value)
                        st.success("Missing values replaced successfully!", icon='✅')
            else:
                st.success("No missing values detected.", icon='🔥')

        with tab6:
            st.subheader(':green[Duplicate Values]')
            duplicates = data.duplicated().sum()
            if duplicates == 0:
                st.info(' No Duplicates Value Found', icon='🔥')

            # The checkbox is only offered when there is something to remove.
            if duplicates > 0 and st.checkbox('Remove Duplicates'):
                data = data.drop_duplicates()
                st.success('Duplicate rows removed!', icon='🔥')

        # __________________ Value Count Section _____________________ #

        st.subheader(':rainbow[Column Value Count]', divider='green')
        with st.expander('Value Count'):
            col1, col2 = st.columns(2)
            with col1:
                column = st.selectbox('Choose Column Name', options=[None] + data.columns.tolist())
            with col2:
                toprows = st.number_input('Number of Top Rows', min_value=1, step=1, value=5)

            # `is not None` (not truthiness) so a column labelled 0 or "" still works.
            if column is not None:
                result = data[column].value_counts().reset_index().head(toprows)
                # Normalise the two column names regardless of what reset_index produced.
                result.columns = [column, 'count']
                st.dataframe(result)

                if not result.empty:
                    fig = px.bar(data_frame=result, x=column, y='count', template='plotly_white')
                    st.plotly_chart(fig)

                    fig = px.line(data_frame=result, x=column, y='count')
                    st.plotly_chart(fig)

                    fig = px.pie(data_frame=result, names=column, values='count')
                    st.plotly_chart(fig)

        # ______________ GroupBy Section _________________________ #

        st.subheader(':blue[Groupby : Simplify Your Data Analysis]', divider='violet')
        st.write("Groupby allows you to summarize data by categories.")

        with st.expander('Group By Your Columns'):
            col1, col2, col3 = st.columns(3)

            with col1:
                groupby_cols = st.multiselect('Choose Columns to Group By', options=data.columns.tolist())

            with col2:
                operation_col = st.selectbox("Choose Column for Operation", options=data.columns.tolist())

            with col3:
                operation = st.selectbox("Choose Operation", options=['sum', 'max', 'min', 'count', 'mean', 'median'])

            if groupby_cols and operation_col is not None and operation:
                # An invalid combination (e.g. 'mean' of a text column) is reported
                # here so that the sections further down the page still render.
                try:
                    result = data.groupby(groupby_cols).agg(newcol=(operation_col, operation)).reset_index()
                except (TypeError, ValueError, KeyError) as group_error:
                    st.error(f"Could not apply '{operation}' to '{operation_col}': {group_error}")
                else:
                    st.dataframe(result)

                    st.subheader(':rainbow[Data Visualization]')
                    graph_type = st.selectbox('Choose Graph Type', options=['line', 'bar', 'scatter', 'pie', 'sunburst'])
                    result_columns = result.columns.tolist()

                    if graph_type == 'line':
                        x_axis = st.selectbox('X Axis', options=result_columns)
                        y_axis = st.selectbox('Y Axis', options=result_columns)
                        fig = px.line(data_frame=result, x=x_axis, y=y_axis)
                        st.plotly_chart(fig)

                    elif graph_type == 'bar':
                        x_axis = st.selectbox('X Axis', options=result_columns)
                        y_axis = st.selectbox('Y Axis', options=result_columns)
                        color = st.selectbox('Color Information', options=[None] + result_columns)
                        fig = px.bar(data_frame=result, x=x_axis, y=y_axis, color=color)
                        st.plotly_chart(fig)

                    elif graph_type == 'pie':
                        values = st.selectbox("Numerical Values", options=result_columns)
                        names = st.selectbox('Labels', options=result_columns)
                        fig = px.pie(data_frame=result, names=names, values=values)
                        st.plotly_chart(fig)

                    elif graph_type == 'scatter':
                        x_axis = st.selectbox('X Axis', options=result_columns)
                        y_axis = st.selectbox('Y Axis', options=result_columns)
                        size = st.selectbox('Size Column', options=[None] + result_columns)
                        color = st.selectbox('Color Information', options=[None] + result_columns)
                        fig = px.scatter(data_frame=result, x=x_axis, y=y_axis, color=color, size=size)
                        st.plotly_chart(fig)

                    elif graph_type == 'sunburst':
                        path = st.multiselect('Path', options=result_columns)
                        # Wait until at least one hierarchy level has been chosen.
                        if path:
                            fig = px.sunburst(data_frame=result, path=path, values='newcol')
                            st.plotly_chart(fig)

        # _________________ Machine Learning_______________ #

        st.subheader(":orange[Basic Machine Learning]", divider='green')
        ml_task = st.selectbox("Select ML Task", ["None", "SVM", "Logistic Regression", "Decision Tree", "K-Nearest Neighbors"])

        if ml_task != "None":
            target_col = st.selectbox("Select Target Column", data.columns)
            feature_cols = st.multiselect("Select Feature Columns", data.columns)

            # Using the target as one of its own predictors leaks the label and
            # makes the reported accuracy meaningless, so drop it from the features.
            if target_col in feature_cols:
                st.warning("The target column was removed from the selected features.")
                feature_cols = [name for name in feature_cols if name != target_col]

            if target_col is not None and feature_cols:
                # Rows without a label cannot be used for training or scoring.
                has_label = data[target_col].notna()
                X = data.loc[has_label, feature_cols]
                y = data.loc[has_label, target_col]

                # Handle Preprocessing (Categorical and Numeric Data)
                # np.number matches the numeric selection used when filling
                # missing values, so every numeric width is included.
                numeric_features = X.select_dtypes(include=[np.number]).columns
                categorical_features = X.select_dtypes(include=['object']).columns

                numeric_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing data
                    ('scaler', StandardScaler())  # Normalize numerical data
                ])

                categorical_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing data
                    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-Hot Encode categorical features
                ])

                preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', numeric_transformer, numeric_features),
                        ('cat', categorical_transformer, categorical_features)
                    ]
                )

                # Map each menu entry to its estimator class; all use default settings.
                classifiers = {
                    "SVM": SVC,
                    "Logistic Regression": LogisticRegression,
                    "Decision Tree": DecisionTreeClassifier,
                    "K-Nearest Neighbors": KNeighborsClassifier,
                }
                model = Pipeline(steps=[
                    ('preprocessor', preprocessor),
                    ('classifier', classifiers[ml_task]()),
                ])

                # Split the data (80% train / 20% test, fixed seed for repeatable runs)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

                # Train the model
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Evaluate the model
                accuracy = accuracy_score(y_test, y_pred)
                st.write(f"Model Accuracy: {accuracy * 100:.2f}%")

    except Exception as e:
        st.error(f"An error occurred: {e}")