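# NOTE(review): "Spaces: Sleeping / Sleeping" appeared here as bare text; it
# looks like page residue rather than program text, so it is kept as a comment.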
# _____________ Import Python Libraries _________________ # | |
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# ________________ Page Configuration Section _____________ #
# Sets the browser-tab title and icon for the app.
st.set_page_config(
    page_title="Data Ocean",
    # The original literal here was mis-encoded ('π₯'), which is not an emoji.
    # Those characters most plausibly decode to the fire emoji.
    # TODO confirm the intended glyph.
    page_icon="🔥",
)
# _________________ Web Page Info Section _____________________ #
# Page banner: the colour-tagged title first, then the tagline as a header.
_banner_title = ":red[Data] :blue[Analytic] :orange[Portal & Machine Learning]"
_banner_tagline = ":rainbow[Explore Data With Ease]"
st.title(_banner_title)
st.header(_banner_tagline)
# __________________ File Upload Section _________________ #
# Status-message icons.  The literals in the original were mis-encoded and are
# not emoji characters: 'β ' most plausibly decodes to the check mark and 'π₯'
# to the fire emoji; the bare 'π' cannot be recovered, so the check mark is
# reused for it.  TODO confirm the intended glyphs.
_ICON_DONE = "✅"
_ICON_NOTE = "🔥"

# ML task label -> scikit-learn classifier class.  Insertion order is the
# order shown in the "Select ML Task" box.
_CLASSIFIERS = {
    "SVM": SVC,
    "Logistic Regression": LogisticRegression,
    "Decision Tree": DecisionTreeClassifier,
    "K-Nearest Neighbors": KNeighborsClassifier,
}


def _read_table(uploaded):
    """Parse an uploaded CSV or Excel file into a DataFrame.

    The parser is chosen from the file name's extension, compared
    case-insensitively.

    Raises:
        ValueError: if the name ends in neither ``.csv`` nor ``.xlsx``.
    """
    name = uploaded.name.lower()
    if name.endswith('.csv'):
        return pd.read_csv(uploaded)
    if name.endswith('.xlsx'):
        return pd.read_excel(uploaded)
    raise ValueError(f"unsupported file type: {uploaded.name}")


def _row_count_slider(n_rows, key):
    """Ask how many rows to display, from 1 up to *n_rows*.

    With zero rows the range 1..0 is invalid, and with one row there is
    nothing to choose, so the slider is skipped and *n_rows* is returned.
    """
    if n_rows <= 1:
        return n_rows
    return st.slider('Number of Rows to Fetch', 1, n_rows, key=key)


def _fill_numeric_nulls(frame, strategy):
    """Return a copy of *frame* with nulls in numeric columns filled.

    *strategy* is 'Mean', 'Median' or 'Mode'.  Non-numeric columns are left
    as they are.  A column with no mode (all values null) is skipped.
    """
    filled = frame.copy()
    for col in filled.select_dtypes(include=[np.number]).columns:
        series = filled[col]
        if strategy == 'Mean':
            value = series.mean()
        elif strategy == 'Median':
            value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                continue
            value = modes.iloc[0]
        # Assign the result back: a chained in-place fillna on data[col] is
        # not guaranteed to modify the parent frame.
        filled[col] = series.fillna(value)
    return filled


def _build_preprocessor(features):
    """Build the ColumnTransformer for *features*.

    int64/float64 columns are mean-imputed and standard-scaled; object
    columns are imputed with the constant 'missing' and one-hot encoded.

    Returns:
        (preprocessor, ignored) where *ignored* lists the columns of any
        other dtype, which the transformer does not pass to the model.
    """
    numeric_features = features.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = features.select_dtypes(include=['object']).columns
    handled = set(numeric_features) | set(categorical_features)
    ignored = [col for col in features.columns if col not in handled]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing data
        ('scaler', StandardScaler()),  # Normalize numerical data
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing data
        ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One-Hot Encode categorical features
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
    )
    return preprocessor, ignored


file = st.file_uploader('Drop Your CSV, Excel', type=['csv', 'xlsx'])
if file is not None:
    try:
        data = _read_table(file)
        st.dataframe(data)
        st.success("File Successfully Uploaded", icon=_ICON_DONE)

        # ________________ Basic Info Summary Section ______________ #
        st.subheader(':rainbow[Basic Information of The Dataset]', divider='violet')
        tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(
            ['Summary', 'Top & Bottom Rows', 'Data Types', 'Columns', 'Missing Values', 'Duplicates Value']
        )
        with tab1:
            st.write(f'There are {data.shape[0]} Rows and {data.shape[1]} Columns in The Dataset')
            st.subheader(':blue[Statistical Summary]')
            st.dataframe(data.describe())
        with tab2:
            st.subheader(':gray[Top Rows]')
            top_rows = _row_count_slider(data.shape[0], key='topslider')
            st.dataframe(data.head(top_rows))
            st.subheader(':green[Bottom Rows]')
            bottom_rows = _row_count_slider(data.shape[0], key='bottomslider')
            st.dataframe(data.tail(bottom_rows))
        with tab3:
            st.subheader(':orange[Data Types]')
            # Show each dtype next to its column name; a bare list of dtypes
            # does not say which column each entry belongs to.
            st.dataframe(data.dtypes.astype(str).rename('dtype'))
        with tab4:
            st.subheader(':green[Columns]')
            st.write(data.columns.tolist())
        with tab5:
            st.subheader(':red[Missing Values]')
            missing_values = data.isnull().sum()
            st.dataframe(missing_values)
            if missing_values.sum() > 0:
                remove_tab, fill_tab = st.tabs(['Remove Missing Values', 'Fill Missing Values'])
                with remove_tab:
                    if st.checkbox("Remove Rows with Missing Values"):
                        # dropna(inplace=True) returns None, so rebinding its
                        # result would lose the frame; take the returned copy.
                        data = data.dropna()
                        st.success('Rows with missing values removed!', icon=_ICON_DONE)
                with fill_tab:
                    replace_nulls = st.selectbox('Replace Missing Values With:', ['None', 'Mean', 'Median', 'Mode'])
                    if replace_nulls != 'None':
                        data = _fill_numeric_nulls(data, replace_nulls)
                        st.success("Missing values replaced successfully!", icon=_ICON_DONE)
            else:
                st.success("No missing values detected.", icon=_ICON_NOTE)
        with tab6:
            st.subheader(':green[Duplicate Values]')
            duplicates = data.duplicated().sum()
            if duplicates == 0:
                st.info(' No Duplicates Value Found', icon=_ICON_NOTE)
            elif st.checkbox('Remove Duplicates'):
                data = data.drop_duplicates()
                st.success('Duplicate rows removed!', icon=_ICON_NOTE)

        # __________________ Value Count Section _____________________ #
        st.subheader(':rainbow[Column Value Count]', divider='green')
        with st.expander('Value Count'):
            col1, col2 = st.columns(2)
            with col1:
                column = st.selectbox('Choose Column Name', options=[None] + data.columns.tolist())
            with col2:
                toprows = st.number_input('Number of Top Rows', min_value=1, step=1, value=5)
            # Compare against None so a falsy column label (0, '') still works.
            if column is not None:
                result = data[column].value_counts().reset_index().head(toprows)
                result.columns = [column, 'count']
                st.dataframe(result)
                if not result.empty:
                    fig = px.bar(data_frame=result, x=column, y='count', template='plotly_white')
                    st.plotly_chart(fig)
                    fig = px.line(data_frame=result, x=column, y='count')
                    st.plotly_chart(fig)
                    fig = px.pie(data_frame=result, names=column, values='count')
                    st.plotly_chart(fig)

        # ______________ GroupBy Section _________________________ #
        st.subheader(':blue[Groupby : Simplify Your Data Analysis]', divider='violet')
        st.write("Groupby allows you to summarize data by categories.")
        with st.expander('Group By Your Columns'):
            col1, col2, col3 = st.columns(3)
            with col1:
                groupby_cols = st.multiselect('Choose Columns to Group By', options=data.columns.tolist())
            with col2:
                operation_col = st.selectbox("Choose Column for Operation", options=data.columns.tolist())
            with col3:
                operation = st.selectbox("Choose Operation", options=['sum', 'max', 'min', 'count', 'mean', 'median'])

            result = None
            if groupby_cols and operation_col is not None and operation:
                try:
                    result = data.groupby(groupby_cols).agg(newcol=(operation_col, operation)).reset_index()
                except (TypeError, ValueError, KeyError, pd.errors.DataError) as err:
                    # Report an invalid combination here so the sections
                    # further down the page are still rendered.
                    st.warning(f"Cannot apply '{operation}' to '{operation_col}' for this grouping: {err}")

            if result is not None:
                st.dataframe(result)
                st.subheader(':rainbow[Data Visualization]')
                graph_type = st.selectbox('Choose Graph Type', options=['line', 'bar', 'scatter', 'pie', 'sunburst'])
                plot_cols = result.columns.tolist()
                optional_cols = [None] + plot_cols
                fig = None
                if graph_type in ('line', 'bar', 'scatter'):
                    x_axis = st.selectbox('X Axis', options=plot_cols)
                    y_axis = st.selectbox('Y Axis', options=plot_cols)
                    if graph_type == 'line':
                        fig = px.line(data_frame=result, x=x_axis, y=y_axis)
                    elif graph_type == 'bar':
                        color = st.selectbox('Color Information', options=optional_cols)
                        fig = px.bar(data_frame=result, x=x_axis, y=y_axis, color=color)
                    else:
                        size = st.selectbox('Size Column', options=optional_cols)
                        color = st.selectbox('Color Information', options=optional_cols)
                        fig = px.scatter(data_frame=result, x=x_axis, y=y_axis, color=color, size=size)
                elif graph_type == 'pie':
                    values = st.selectbox("Numerical Values", options=plot_cols)
                    names = st.selectbox('Labels', options=plot_cols)
                    fig = px.pie(data_frame=result, names=names, values=values)
                elif graph_type == 'sunburst':
                    path = st.multiselect('Path', options=plot_cols)
                    # An empty path describes no hierarchy; wait for a choice.
                    if path:
                        fig = px.sunburst(data_frame=result, path=path, values='newcol')
                    else:
                        st.info('Choose at least one column for the sunburst path.')
                if fig is not None:
                    st.plotly_chart(fig)

        # _________________ Machine Learning_______________ #
        st.subheader(":orange[Basic Machine Learning]", divider='green')
        ml_task = st.selectbox("Select ML Task", ["None", *_CLASSIFIERS])
        if ml_task != "None":
            target_col = st.selectbox("Select Target Column", data.columns)
            feature_cols = st.multiselect("Select Feature Columns", data.columns)
            if target_col in feature_cols:
                # Training on the label itself would make the accuracy meaningless.
                st.warning(f"'{target_col}' is the target, so it is not used as a feature.")
                feature_cols = [col for col in feature_cols if col != target_col]
            if target_col is not None and feature_cols:
                # Rows without a label cannot be used for training or scoring.
                labelled = data.dropna(subset=[target_col])
                X = labelled[feature_cols]
                y = labelled[target_col]

                # Handle Preprocessing (Categorical and Numeric Data)
                preprocessor, ignored = _build_preprocessor(X)
                if ignored:
                    st.warning(f"Columns with unsupported data types are not used: {ignored}")

                if len(ignored) == len(feature_cols):
                    st.warning("None of the selected feature columns can be used for training.")
                else:
                    # Create model pipeline based on selected task
                    model = Pipeline(steps=[
                        ('preprocessor', preprocessor),
                        ('classifier', _CLASSIFIERS[ml_task]()),
                    ])
                    # Split the data
                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                    # Train the model
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    # Evaluate the model
                    accuracy = accuracy_score(y_test, y_pred)
                    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
    except Exception as e:
        # Top-level boundary for the page: show the failure instead of a traceback.
        st.error(f"An error occurred: {e}")