# app.py
#
# Source: Hugging Face Space, commit "Create app.py" (file added, 248 lines).
# The web-page chrome and empty diff-gutter rows from the page extraction were
# collapsed into this header; no program content was removed.
# _____________ Import Python Libraries _________________ #

from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# ________________ Page Configuration Section _____________ #
|
19 |
+
|
20 |
+
st.set_page_config(
|
21 |
+
page_title="Data Ocean",
|
22 |
+
page_icon= 'π₯'
|
23 |
+
)
|
24 |
+
|
25 |
+
# _________________ Web Page Info Section _____________________ #
|
26 |
+
|
27 |
+
st.title(":red[Data] :blue[Analytic] :orange[Portal & Machine Learning]")
|
28 |
+
st.header(":rainbow[Explore Data With Ease]")
|
29 |
+
|
30 |
+
# __________________ File Upload Section _________________ #
|
31 |
+
|
32 |
+
file = st.file_uploader('Drop Your CSV, Excel', type=['csv', 'xlsx'])
|
33 |
+
|
34 |
+
if file is not None:
|
35 |
+
try:
|
36 |
+
if file.name.endswith('csv'):
|
37 |
+
data = pd.read_csv(file)
|
38 |
+
elif file.name.endswith('xlsx'):
|
39 |
+
data = pd.read_excel(file)
|
40 |
+
else:
|
41 |
+
pass
|
42 |
+
|
43 |
+
st.dataframe(data)
|
44 |
+
st.success("File Successfully Uploaded" ,icon='π')
|
45 |
+
|
46 |
+
# ________________ Basic Info Summary Section ______________ #
|
47 |
+
|
48 |
+
st.subheader(':rainbow[Basic Information of The Dataset]',divider='violet')
|
49 |
+
tab1, tab2, tab3, tab4 ,tab5 , tab6 = st.tabs(['Summary', 'Top & Bottom Rows', 'Data Types', 'Columns','Missing Values','Duplicates Value'])
|
50 |
+
|
51 |
+
with tab1:
|
52 |
+
st.write(f'There are {data.shape[0]} Rows and {data.shape[1]} Columns in The Dataset')
|
53 |
+
st.subheader(':blue[Statistical Summary]')
|
54 |
+
st.dataframe(data.describe())
|
55 |
+
|
56 |
+
with tab2:
|
57 |
+
st.subheader(':gray[Top Rows]')
|
58 |
+
top_rows = st.slider('Number of Rows to Fetch', 1, data.shape[0], key='topslider')
|
59 |
+
st.dataframe(data.head(top_rows))
|
60 |
+
|
61 |
+
st.subheader(':green[Bottom Rows]')
|
62 |
+
bottom_rows = st.slider('Number of Rows to Fetch', 1, data.shape[0], key='bottomslider')
|
63 |
+
st.dataframe(data.tail(bottom_rows))
|
64 |
+
|
65 |
+
with tab3:
|
66 |
+
st.subheader(':orange[Data Types]')
|
67 |
+
st.write(data.dtypes.tolist())
|
68 |
+
|
69 |
+
with tab4:
|
70 |
+
st.subheader(':green[Columns]')
|
71 |
+
st.write(data.columns.tolist())
|
72 |
+
|
73 |
+
with tab5:
|
74 |
+
st.subheader(':red[Missing Values]')
|
75 |
+
missing_values = data.isnull().sum()
|
76 |
+
st.dataframe(missing_values)
|
77 |
+
if missing_values.sum() > 0:
|
78 |
+
remove_tab, fill_tab = st.tabs(['Remove Missing Values', 'Fill Missing Values'])
|
79 |
+
|
80 |
+
with remove_tab:
|
81 |
+
if st.checkbox("Remove Rows with Missing Values"):
|
82 |
+
data = data.dropna(inplace=True)
|
83 |
+
st.success('Rows with missing values removed!', icon="π")
|
84 |
+
|
85 |
+
with fill_tab:
|
86 |
+
replace_nulls = st.selectbox('Replace Missing Values With:', ['None', 'Mean', 'Median', 'Mode'])
|
87 |
+
|
88 |
+
if replace_nulls != 'None':
|
89 |
+
for col in data.select_dtypes(include=[np.number]):
|
90 |
+
if replace_nulls == 'Mean':
|
91 |
+
data[col].fillna(data[col].mean(), inplace=True)
|
92 |
+
elif replace_nulls == 'Median':
|
93 |
+
data[col].fillna(data[col].median(), inplace=True)
|
94 |
+
elif replace_nulls == 'Mode':
|
95 |
+
data[col].fillna(data[col].mode()[0], inplace=True)
|
96 |
+
st.success("Missing values replaced successfully!", icon='β
')
|
97 |
+
else:
|
98 |
+
st.success("No missing values detected.", icon='π₯')
|
99 |
+
|
100 |
+
with tab6:
|
101 |
+
st.subheader(':green[Duplicate Values]')
|
102 |
+
duplicates = data.duplicated().sum()
|
103 |
+
if duplicates ==0:
|
104 |
+
st.info(f' No Duplicates Value Found',icon='π₯')
|
105 |
+
|
106 |
+
if duplicates > 0 and st.checkbox('Remove Duplicates'):
|
107 |
+
data = data.drop_duplicates()
|
108 |
+
st.success('Duplicate rows removed!', icon='π₯')
|
109 |
+
|
110 |
+
|
111 |
+
# __________________ Value Count Section _____________________ #
|
112 |
+
|
113 |
+
st.subheader(':rainbow[Column Value Count]',divider='green')
|
114 |
+
with st.expander('Value Count'):
|
115 |
+
col1, col2 = st.columns(2)
|
116 |
+
with col1:
|
117 |
+
column = st.selectbox('Choose Column Name', options=[None] + data.columns.tolist())
|
118 |
+
with col2:
|
119 |
+
toprows = st.number_input('Number of Top Rows', min_value=1, step=1, value=5)
|
120 |
+
|
121 |
+
if column:
|
122 |
+
result = data[column].value_counts().reset_index().head(toprows)
|
123 |
+
result.columns = [column, 'count']
|
124 |
+
st.dataframe(result)
|
125 |
+
|
126 |
+
if not result.empty:
|
127 |
+
fig = px.bar(data_frame=result, x=column, y='count', template='plotly_white')
|
128 |
+
st.plotly_chart(fig)
|
129 |
+
|
130 |
+
fig = px.line(data_frame=result, x=column, y='count')
|
131 |
+
st.plotly_chart(fig)
|
132 |
+
|
133 |
+
fig = px.pie(data_frame=result, names=column, values='count')
|
134 |
+
st.plotly_chart(fig)
|
135 |
+
|
136 |
+
# ______________ GroupBy Section _________________________ #
|
137 |
+
|
138 |
+
st.subheader(':blue[Groupby : Simplify Your Data Analysis]',divider='violet')
|
139 |
+
st.write("Groupby allows you to summarize data by categories.")
|
140 |
+
|
141 |
+
with st.expander('Group By Your Columns'):
|
142 |
+
col1, col2, col3 = st.columns(3)
|
143 |
+
|
144 |
+
with col1:
|
145 |
+
groupby_cols = st.multiselect('Choose Columns to Group By', options=data.columns.tolist())
|
146 |
+
|
147 |
+
with col2:
|
148 |
+
operation_col = st.selectbox("Choose Column for Operation", options=data.columns.tolist())
|
149 |
+
|
150 |
+
with col3:
|
151 |
+
operation = st.selectbox("Choose Operation", options=['sum', 'max', 'min', 'count', 'mean', 'median'])
|
152 |
+
|
153 |
+
if groupby_cols and operation_col and operation:
|
154 |
+
result = data.groupby(groupby_cols).agg(newcol=(operation_col, operation)).reset_index()
|
155 |
+
st.dataframe(result)
|
156 |
+
|
157 |
+
st.subheader(':rainbow[Data Visualization]')
|
158 |
+
graph_type = st.selectbox('Choose Graph Type', options=['line', 'bar', 'scatter', 'pie', 'sunburst'])
|
159 |
+
|
160 |
+
if graph_type == 'line':
|
161 |
+
x_axis = st.selectbox('X Axis', options=result.columns.tolist())
|
162 |
+
y_axis = st.selectbox('Y Axis', options=result.columns.tolist())
|
163 |
+
fig = px.line(data_frame=result, x=x_axis, y=y_axis)
|
164 |
+
st.plotly_chart(fig)
|
165 |
+
|
166 |
+
elif graph_type == 'bar':
|
167 |
+
x_axis = st.selectbox('X Axis', options=result.columns.tolist())
|
168 |
+
y_axis = st.selectbox('Y Axis', options=result.columns.tolist())
|
169 |
+
color = st.selectbox('Color Information', options=[None] + result.columns.tolist())
|
170 |
+
fig = px.bar(data_frame=result, x=x_axis, y=y_axis, color=color)
|
171 |
+
st.plotly_chart(fig)
|
172 |
+
|
173 |
+
elif graph_type == 'pie':
|
174 |
+
values = st.selectbox("Numerical Values", options=result.columns.tolist())
|
175 |
+
names = st.selectbox('Labels', options=result.columns.tolist())
|
176 |
+
fig = px.pie(data_frame=result, names=names, values=values)
|
177 |
+
st.plotly_chart(fig)
|
178 |
+
|
179 |
+
elif graph_type == 'scatter':
|
180 |
+
x_axis = st.selectbox('X Axis', options=result.columns.tolist())
|
181 |
+
y_axis = st.selectbox('Y Axis', options=result.columns.tolist())
|
182 |
+
size = st.selectbox('Size Column', options=[None] + result.columns.tolist())
|
183 |
+
color = st.selectbox('Color Information', options=[None] + result.columns.tolist())
|
184 |
+
fig = px.scatter(data_frame=result, x=x_axis, y=y_axis, color=color, size=size)
|
185 |
+
st.plotly_chart(fig)
|
186 |
+
|
187 |
+
elif graph_type == 'sunburst':
|
188 |
+
path = st.multiselect('Path', options=result.columns.tolist())
|
189 |
+
fig = px.sunburst(data_frame=result, path=path, values='newcol')
|
190 |
+
st.plotly_chart(fig)
|
191 |
+
|
192 |
+
#_________________ Machine Learning_______________ #
|
193 |
+
|
194 |
+
st.subheader(":orange[Basic Machine Learning]",divider='green')
|
195 |
+
ml_task = st.selectbox("Select ML Task", ["None", "SVM", "Logistic Regression", "Decision Tree", "K-Nearest Neighbors"])
|
196 |
+
|
197 |
+
if ml_task != "None":
|
198 |
+
target_col = st.selectbox("Select Target Column", data.columns)
|
199 |
+
feature_cols = st.multiselect("Select Feature Columns", data.columns)
|
200 |
+
|
201 |
+
if target_col and feature_cols:
|
202 |
+
X = data[feature_cols]
|
203 |
+
y = data[target_col]
|
204 |
+
|
205 |
+
# Handle Preprocessing (Categorical and Numeric Data)
|
206 |
+
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
|
207 |
+
categorical_features = X.select_dtypes(include=['object']).columns
|
208 |
+
|
209 |
+
numeric_transformer = Pipeline(steps=[
|
210 |
+
('imputer', SimpleImputer(strategy='mean')), # Handle missing data
|
211 |
+
('scaler', StandardScaler()) # Normalize numerical data
|
212 |
+
])
|
213 |
+
|
214 |
+
categorical_transformer = Pipeline(steps=[
|
215 |
+
('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # Handle missing data
|
216 |
+
('onehot', OneHotEncoder(handle_unknown='ignore')) # One-Hot Encode categorical features
|
217 |
+
])
|
218 |
+
|
219 |
+
preprocessor = ColumnTransformer(
|
220 |
+
transformers=[
|
221 |
+
('num', numeric_transformer, numeric_features),
|
222 |
+
('cat', categorical_transformer, categorical_features)
|
223 |
+
]
|
224 |
+
)
|
225 |
+
|
226 |
+
# Create model pipeline based on selected task
|
227 |
+
if ml_task == "SVM":
|
228 |
+
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', SVC())])
|
229 |
+
elif ml_task == "Logistic Regression":
|
230 |
+
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())])
|
231 |
+
elif ml_task == "Decision Tree":
|
232 |
+
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', DecisionTreeClassifier())])
|
233 |
+
elif ml_task == "K-Nearest Neighbors":
|
234 |
+
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', KNeighborsClassifier())])
|
235 |
+
|
236 |
+
# Split the data
|
237 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
238 |
+
|
239 |
+
# Train the model
|
240 |
+
model.fit(X_train, y_train)
|
241 |
+
y_pred = model.predict(X_test)
|
242 |
+
|
243 |
+
# Evaluate the model
|
244 |
+
accuracy = accuracy_score(y_test, y_pred)
|
245 |
+
st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
|
246 |
+
|
247 |
+
except Exception as e:
|
248 |
+
st.error(f"An error occurred: {e}")
|