Spaces:

dummydj2633
/

WEB_App

Running

App Files Files Community

WEB_App / app.py

dummydj2633

Create app.py

0e99845 verified 7 months ago

raw

history blame contribute delete

11.3 kB

	# _____________ Import Python Libraries _________________ #

	import streamlit as st
	import numpy as np
	import plotly.express as px
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import SVC
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.metrics import accuracy_score
	from sklearn.impute import SimpleImputer
	from sklearn.preprocessing import OneHotEncoder, StandardScaler
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline

	# ________________ Page Configuration Section _____________ #

	# Browser-tab title and icon for the app.
	page_settings = {"page_title": "Data Ocean", "page_icon": '🔥'}
	st.set_page_config(**page_settings)

	# ---------------- Headline shown at the top of the page ---------------- #

	st.title(":red[Data] :blue[Analytic] :orange[Portal & Machine Learning]")
	st.header(":rainbow[Explore Data With Ease]")

	# ---------------- Upload widget (CSV or Excel workbooks) ---------------- #

	# `file` stays None until the user has dropped a file into the widget.
	accepted_extensions = ['csv', 'xlsx']
	file = st.file_uploader('Drop Your CSV, Excel', type=accepted_extensions)

	# ________________ Dataset Exploration & Modelling ________________ #
	#
	# Everything below runs only once a file has been uploaded. The uploaded
	# table is loaded into `data`, summarised in tabs, optionally cleaned
	# (missing values / duplicates), explored through value counts and
	# group-by charts, and finally fed to a simple classification pipeline.
	# Any failure is reported in the page through the handler at the bottom.
	if file is not None:
	    try:
	        # Compare the extension case-insensitively so "DATA.CSV" is accepted too.
	        file_name = file.name.lower()
	        if file_name.endswith('.csv'):
	            data = pd.read_csv(file)
	        elif file_name.endswith('.xlsx'):
	            data = pd.read_excel(file)
	        else:
	            # Fail with a readable message instead of a NameError on `data`.
	            raise ValueError(f"Unsupported file type: {file.name}")

	        st.dataframe(data)
	        st.success("File Successfully Uploaded", icon='🎉')

	        # ________________ Basic Info Summary Section ______________ #

	        st.subheader(':rainbow[Basic Information of The Dataset]', divider='violet')
	        tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(
	            ['Summary', 'Top & Bottom Rows', 'Data Types', 'Columns', 'Missing Values', 'Duplicates Value']
	        )

	        with tab1:
	            st.write(f'There are {data.shape[0]} Rows and {data.shape[1]} Columns in The Dataset')
	            st.subheader(':blue[Statistical Summary]')
	            st.dataframe(data.describe())

	        with tab2:
	            total_rows = data.shape[0]
	            # A 1..N slider cannot be built for an empty frame (min 1 > max 0)
	            # and offers no choice for a single row, so it is only shown when
	            # there are at least two rows; otherwise the whole frame is shown.
	            st.subheader(':gray[Top Rows]')
	            if total_rows > 1:
	                top_rows = st.slider('Number of Rows to Fetch', 1, total_rows, key='topslider')
	            else:
	                top_rows = total_rows
	            st.dataframe(data.head(top_rows))

	            st.subheader(':green[Bottom Rows]')
	            if total_rows > 1:
	                bottom_rows = st.slider('Number of Rows to Fetch', 1, total_rows, key='bottomslider')
	            else:
	                bottom_rows = total_rows
	            st.dataframe(data.tail(bottom_rows))

	        with tab3:
	            st.subheader(':orange[Data Types]')
	            st.write(data.dtypes.tolist())

	        with tab4:
	            st.subheader(':green[Columns]')
	            st.write(data.columns.tolist())

	        with tab5:
	            st.subheader(':red[Missing Values]')
	            missing_values = data.isnull().sum()
	            st.dataframe(missing_values)
	            if missing_values.sum() > 0:
	                remove_tab, fill_tab = st.tabs(['Remove Missing Values', 'Fill Missing Values'])

	                with remove_tab:
	                    if st.checkbox("Remove Rows with Missing Values"):
	                        # dropna(inplace=True) returns None; rebind the returned
	                        # copy so `data` stays a DataFrame for later sections.
	                        data = data.dropna()
	                        st.success('Rows with missing values removed!', icon="🎉")

	                with fill_tab:
	                    replace_nulls = st.selectbox('Replace Missing Values With:', ['None', 'Mean', 'Median', 'Mode'])

	                    if replace_nulls != 'None':
	                        # Only numeric columns are filled; other columns are left as they are.
	                        for col in data.select_dtypes(include=[np.number]).columns:
	                            series = data[col]
	                            if replace_nulls == 'Mean':
	                                fill_value = series.mean()
	                            elif replace_nulls == 'Median':
	                                fill_value = series.median()
	                            else:  # 'Mode'
	                                modes = series.mode()
	                                if modes.empty:
	                                    # Column is entirely missing: there is no mode to fill with.
	                                    continue
	                                fill_value = modes.iloc[0]
	                            # Assign back instead of chained fillna(inplace=True),
	                            # which may act on a temporary copy and leave `data` unchanged.
	                            data[col] = series.fillna(fill_value)
	                        st.success("Missing values replaced successfully!", icon='✅')
	            else:
	                st.success("No missing values detected.", icon='🔥')

	        with tab6:
	            st.subheader(':green[Duplicate Values]')
	            duplicates = data.duplicated().sum()
	            if duplicates == 0:
	                st.info(' No Duplicates Value Found', icon='🔥')
	            elif st.checkbox('Remove Duplicates'):
	                data = data.drop_duplicates()
	                st.success('Duplicate rows removed!', icon='🔥')

	        # __________________ Value Count Section _____________________ #

	        st.subheader(':rainbow[Column Value Count]', divider='green')
	        with st.expander('Value Count'):
	            col1, col2 = st.columns(2)
	            with col1:
	                column = st.selectbox('Choose Column Name', options=[None] + data.columns.tolist())
	            with col2:
	                toprows = st.number_input('Number of Top Rows', min_value=1, step=1, value=5)

	            # Compare against None explicitly: a column labelled 0 or '' is falsy
	            # but is still a valid selection.
	            if column is not None:
	                counts = data[column].value_counts().reset_index().head(toprows)
	                counts.columns = [column, 'count']
	                st.dataframe(counts)

	                if not counts.empty:
	                    fig = px.bar(data_frame=counts, x=column, y='count', template='plotly_white')
	                    st.plotly_chart(fig)

	                    fig = px.line(data_frame=counts, x=column, y='count')
	                    st.plotly_chart(fig)

	                    fig = px.pie(data_frame=counts, names=column, values='count')
	                    st.plotly_chart(fig)

	        # ______________ GroupBy Section _________________________ #

	        st.subheader(':blue[Groupby : Simplify Your Data Analysis]', divider='violet')
	        st.write("Groupby allows you to summarize data by categories.")

	        with st.expander('Group By Your Columns'):
	            col1, col2, col3 = st.columns(3)

	            with col1:
	                groupby_cols = st.multiselect('Choose Columns to Group By', options=data.columns.tolist())

	            with col2:
	                operation_col = st.selectbox("Choose Column for Operation", options=data.columns.tolist())

	            with col3:
	                operation = st.selectbox("Choose Operation", options=['sum', 'max', 'min', 'count', 'mean', 'median'])

	            if groupby_cols and operation_col is not None and operation:
	                # The aggregated value is always exposed under the column name 'newcol'.
	                grouped = data.groupby(groupby_cols).agg(newcol=(operation_col, operation)).reset_index()
	                st.dataframe(grouped)

	                st.subheader(':rainbow[Data Visualization]')
	                graph_type = st.selectbox('Choose Graph Type', options=['line', 'bar', 'scatter', 'pie', 'sunburst'])
	                plot_columns = grouped.columns.tolist()
	                fig = None

	                if graph_type in ('line', 'bar', 'scatter'):
	                    # These three chart types share the X/Y selectors.
	                    x_axis = st.selectbox('X Axis', options=plot_columns)
	                    y_axis = st.selectbox('Y Axis', options=plot_columns)

	                    if graph_type == 'line':
	                        fig = px.line(data_frame=grouped, x=x_axis, y=y_axis)
	                    elif graph_type == 'bar':
	                        color = st.selectbox('Color Information', options=[None] + plot_columns)
	                        fig = px.bar(data_frame=grouped, x=x_axis, y=y_axis, color=color)
	                    else:  # 'scatter'
	                        size = st.selectbox('Size Column', options=[None] + plot_columns)
	                        color = st.selectbox('Color Information', options=[None] + plot_columns)
	                        fig = px.scatter(data_frame=grouped, x=x_axis, y=y_axis, color=color, size=size)

	                elif graph_type == 'pie':
	                    values = st.selectbox("Numerical Values", options=plot_columns)
	                    names = st.selectbox('Labels', options=plot_columns)
	                    fig = px.pie(data_frame=grouped, names=names, values=values)

	                elif graph_type == 'sunburst':
	                    path = st.multiselect('Path', options=plot_columns)
	                    if path:
	                        fig = px.sunburst(data_frame=grouped, path=path, values='newcol')
	                    else:
	                        # A sunburst needs at least one level; wait for a selection.
	                        st.info('Select at least one column for the path.')

	                if fig is not None:
	                    st.plotly_chart(fig)

	        # _________________ Machine Learning _______________ #

	        st.subheader(":orange[Basic Machine Learning]", divider='green')
	        ml_task = st.selectbox("Select ML Task", ["None", "SVM", "Logistic Regression", "Decision Tree", "K-Nearest Neighbors"])

	        if ml_task != "None":
	            target_col = st.selectbox("Select Target Column", data.columns)
	            feature_cols = st.multiselect("Select Feature Columns", data.columns)

	            if target_col is not None and target_col in feature_cols:
	                # Training on the label itself would make the reported accuracy meaningless.
	                st.warning("The target column was removed from the feature columns.")
	                feature_cols = [name for name in feature_cols if name != target_col]

	            if target_col is not None and feature_cols:
	                # Rows without a label cannot be used for training or scoring.
	                ml_data = data.dropna(subset=[target_col])
	                X = ml_data[feature_cols]
	                y = ml_data[target_col]

	                # Handle Preprocessing (Categorical and Numeric Data).
	                # np.number matches every numeric dtype (not just int64/float64),
	                # consistent with the missing-value filling above.
	                numeric_features = X.select_dtypes(include=[np.number]).columns
	                categorical_features = X.select_dtypes(include=['object']).columns

	                # The ColumnTransformer drops columns it was not told about, so
	                # say which selected features will not take part in training.
	                skipped = [
	                    name for name in X.columns
	                    if name not in numeric_features and name not in categorical_features
	                ]
	                if skipped:
	                    st.warning(f"Ignoring feature columns with unsupported data types: {skipped}")
	                if len(numeric_features) + len(categorical_features) == 0:
	                    raise ValueError("None of the selected feature columns has a supported data type.")

	                numeric_transformer = Pipeline(steps=[
	                    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing data
	                    ('scaler', StandardScaler()),  # Normalize numerical data
	                ])

	                categorical_transformer = Pipeline(steps=[
	                    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing data
	                    ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One-Hot Encode categorical features
	                ])

	                preprocessor = ColumnTransformer(
	                    transformers=[
	                        ('num', numeric_transformer, numeric_features),
	                        ('cat', categorical_transformer, categorical_features),
	                    ]
	                )

	                # Map each menu entry to its estimator class; every option other
	                # than "None" has an entry here.
	                classifiers = {
	                    "SVM": SVC,
	                    "Logistic Regression": LogisticRegression,
	                    "Decision Tree": DecisionTreeClassifier,
	                    "K-Nearest Neighbors": KNeighborsClassifier,
	                }
	                model = Pipeline(steps=[
	                    ('preprocessor', preprocessor),
	                    ('classifier', classifiers[ml_task]()),
	                ])

	                # Split the data (fixed seed so reruns give the same split)
	                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	                # Train the model
	                model.fit(X_train, y_train)
	                y_pred = model.predict(X_test)

	                # Evaluate the model
	                accuracy = accuracy_score(y_test, y_pred)
	                st.write(f"Model Accuracy: {accuracy * 100:.2f}%")

	    except Exception as e:
	        # Top-level UI boundary: show the failure in the page rather than a traceback.
	        st.error(f"An error occurred: {e}")