dummydj2633 committed on
Commit
0e99845
·
verified ·
1 Parent(s): fa4cd41

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -0
app.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # _____________ Import Python Libraries _________________ #
2
+
3
+ import streamlit as st
4
+ import numpy as np
5
+ import plotly.express as px
6
+ import pandas as pd
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.linear_model import LogisticRegression
9
+ from sklearn.svm import SVC
10
+ from sklearn.tree import DecisionTreeClassifier
11
+ from sklearn.neighbors import KNeighborsClassifier
12
+ from sklearn.metrics import accuracy_score
13
+ from sklearn.impute import SimpleImputer
14
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
15
+ from sklearn.compose import ColumnTransformer
16
+ from sklearn.pipeline import Pipeline
17
+
# ________________ Page Configuration Section _____________ #

# Page title and icon passed to Streamlit's page configuration.
# The icon is the fire emoji; the scraped source carried it as mis-decoded
# bytes ('πŸ”₯'), which is not a valid emoji string.
st.set_page_config(
    page_title="Data Ocean",
    page_icon="🔥",
)

# _________________ Web Page Info Section _____________________ #

# Coloured headline and tagline rendered at the top of the page.
st.title(":red[Data] :blue[Analytic] :orange[Portal & Machine Learning]")
st.header(":rainbow[Explore Data With Ease]")
29
+
# __________________ Helper Functions _________________ #

def load_dataframe(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: File-like object exposing a ``name`` attribute (as returned by
            ``st.file_uploader``).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If the file name does not end in ``.csv`` or ``.xlsx``.
    """
    # Compare case-insensitively so "DATA.CSV" is accepted as well.
    filename = file.name.lower()
    if filename.endswith('.csv'):
        return pd.read_csv(file)
    if filename.endswith('.xlsx'):
        return pd.read_excel(file)
    # Fail loudly instead of leaving the caller with an unbound variable.
    raise ValueError(f"Unsupported file type: {file.name}")


def fill_numeric_nulls(data, strategy):
    """Return a copy of ``data`` with numeric NaNs replaced per column.

    Args:
        data: Source DataFrame; it is not modified.
        strategy: One of ``'Mean'``, ``'Median'`` or ``'Mode'``.

    Returns:
        A new DataFrame in which every numeric column has its missing values
        filled with that column's mean, median or first mode. Non-numeric
        columns are left untouched.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    if strategy not in ('Mean', 'Median', 'Mode'):
        raise ValueError(f"Unknown fill strategy: {strategy}")

    filled = data.copy()
    for col in filled.select_dtypes(include=[np.number]).columns:
        series = filled[col]
        if strategy == 'Mean':
            value = series.mean()
        elif strategy == 'Median':
            value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            value = modes.iloc[0]
        # Assign back instead of a chained ``inplace=True`` fill, which does
        # not reliably write through to the parent frame.
        filled[col] = series.fillna(value)
    return filled


def top_value_counts(data, column, top_n):
    """Return the ``top_n`` most frequent values of ``column`` with counts.

    Args:
        data: Source DataFrame.
        column: Label of the column to count.
        top_n: Maximum number of rows to return.

    Returns:
        A two-column DataFrame: the first column keeps the label ``column``,
        the second holds the frequencies and is labelled ``'count'`` (or
        ``'count_'`` when ``column`` itself is ``'count'``, to avoid a
        duplicate label).
    """
    counts = data[column].value_counts().head(top_n)
    count_label = 'count' if column != 'count' else 'count_'
    return pd.DataFrame({column: counts.index, count_label: counts.to_numpy()})


def build_model(ml_task, features):
    """Build a preprocessing + classifier pipeline for the chosen task.

    Args:
        ml_task: One of ``"SVM"``, ``"Logistic Regression"``,
            ``"Decision Tree"`` or ``"K-Nearest Neighbors"``.
        features: Feature DataFrame, used only to decide which columns are
            numeric and which are categorical.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.
    """
    # Any numeric dtype (not just int64/float64) is scaled; object columns
    # are one-hot encoded.
    numeric_features = features.select_dtypes(include=[np.number]).columns
    categorical_features = features.select_dtypes(include=['object']).columns

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing data
        ('scaler', StandardScaler()),  # Normalize numerical data
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing data
        ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One-Hot Encode categorical features
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
    )

    classifiers = {
        "SVM": SVC,
        "Logistic Regression": LogisticRegression,
        "Decision Tree": DecisionTreeClassifier,
        "K-Nearest Neighbors": KNeighborsClassifier,
    }
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifiers[ml_task]()),
    ])


# ________________ Basic Info Summary Section ______________ #

def render_basic_info(data):
    """Render the dataset overview tabs and return the (possibly cleaned) data.

    Args:
        data: The uploaded DataFrame.

    Returns:
        The DataFrame after any missing-value or duplicate handling the user
        enabled in the tabs.
    """
    st.subheader(':rainbow[Basic Information of The Dataset]', divider='violet')
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
        'Summary', 'Top & Bottom Rows', 'Data Types', 'Columns',
        'Missing Values', 'Duplicates Value',
    ])
    row_count = data.shape[0]

    with tab1:
        st.write(f'There are {data.shape[0]} Rows and {data.shape[1]} Columns in The Dataset')
        st.subheader(':blue[Statistical Summary]')
        st.dataframe(data.describe())

    with tab2:
        st.subheader(':gray[Top Rows]')
        # A slider needs min < max, so only offer one when there is a choice.
        if row_count > 1:
            top_rows = st.slider('Number of Rows to Fetch', 1, row_count, key='topslider')
        else:
            top_rows = row_count
        st.dataframe(data.head(top_rows))

        st.subheader(':green[Bottom Rows]')
        if row_count > 1:
            bottom_rows = st.slider('Number of Rows to Fetch', 1, row_count, key='bottomslider')
        else:
            bottom_rows = row_count
        st.dataframe(data.tail(bottom_rows))

    with tab3:
        st.subheader(':orange[Data Types]')
        st.write(data.dtypes.tolist())

    with tab4:
        st.subheader(':green[Columns]')
        st.write(data.columns.tolist())

    with tab5:
        st.subheader(':red[Missing Values]')
        missing_values = data.isnull().sum()
        st.dataframe(missing_values)
        if missing_values.sum() > 0:
            remove_tab, fill_tab = st.tabs(['Remove Missing Values', 'Fill Missing Values'])

            with remove_tab:
                if st.checkbox("Remove Rows with Missing Values"):
                    # dropna(inplace=True) returns None; rebind the result
                    # of the non-inplace call instead.
                    data = data.dropna()
                    st.success('Rows with missing values removed!', icon="🎉")

            with fill_tab:
                replace_nulls = st.selectbox(
                    'Replace Missing Values With:',
                    ['None', 'Mean', 'Median', 'Mode'],
                )
                if replace_nulls != 'None':
                    data = fill_numeric_nulls(data, replace_nulls)
                    st.success("Missing values replaced successfully!", icon='✅')
        else:
            st.success("No missing values detected.", icon='🔥')

    with tab6:
        st.subheader(':green[Duplicate Values]')
        duplicates = data.duplicated().sum()
        if duplicates == 0:
            st.info(' No Duplicates Value Found', icon='🔥')
        elif st.checkbox('Remove Duplicates'):
            data = data.drop_duplicates()
            st.success('Duplicate rows removed!', icon='🔥')

    return data


# __________________ Value Count Section _____________________ #

def render_value_counts(data):
    """Render the per-column value-count table and its bar/line/pie charts."""
    st.subheader(':rainbow[Column Value Count]', divider='green')
    with st.expander('Value Count'):
        col1, col2 = st.columns(2)
        with col1:
            column = st.selectbox('Choose Column Name', options=[None] + data.columns.tolist())
        with col2:
            toprows = st.number_input('Number of Top Rows', min_value=1, step=1, value=5)

        # Compare against None so falsy labels such as 0 are still accepted.
        if column is None:
            return

        result = top_value_counts(data, column, toprows)
        count_label = result.columns[1]
        st.dataframe(result)

        if not result.empty:
            fig = px.bar(data_frame=result, x=column, y=count_label, template='plotly_white')
            st.plotly_chart(fig)

            fig = px.line(data_frame=result, x=column, y=count_label)
            st.plotly_chart(fig)

            fig = px.pie(data_frame=result, names=column, values=count_label)
            st.plotly_chart(fig)


# ______________ GroupBy Section _________________________ #

def render_groupby(data):
    """Render the group-by builder and the chart of the aggregated result."""
    st.subheader(':blue[Groupby : Simplify Your Data Analysis]', divider='violet')
    st.write("Groupby allows you to summarize data by categories.")

    with st.expander('Group By Your Columns'):
        col1, col2, col3 = st.columns(3)

        with col1:
            groupby_cols = st.multiselect('Choose Columns to Group By', options=data.columns.tolist())

        with col2:
            operation_col = st.selectbox("Choose Column for Operation", options=data.columns.tolist())

        with col3:
            operation = st.selectbox(
                "Choose Operation",
                options=['sum', 'max', 'min', 'count', 'mean', 'median'],
            )

        if not groupby_cols or operation_col is None or not operation:
            return
        if operation_col in groupby_cols:
            # A grouping key cannot also be the aggregated column.
            st.warning("Choose an operation column that is not one of the group-by columns.")
            return

        result = data.groupby(groupby_cols).agg(newcol=(operation_col, operation)).reset_index()
        st.dataframe(result)

        st.subheader(':rainbow[Data Visualization]')
        graph_type = st.selectbox(
            'Choose Graph Type',
            options=['line', 'bar', 'scatter', 'pie', 'sunburst'],
        )
        result_columns = result.columns.tolist()

        if graph_type == 'line':
            x_axis = st.selectbox('X Axis', options=result_columns)
            y_axis = st.selectbox('Y Axis', options=result_columns)
            fig = px.line(data_frame=result, x=x_axis, y=y_axis)
            st.plotly_chart(fig)

        elif graph_type == 'bar':
            x_axis = st.selectbox('X Axis', options=result_columns)
            y_axis = st.selectbox('Y Axis', options=result_columns)
            color = st.selectbox('Color Information', options=[None] + result_columns)
            fig = px.bar(data_frame=result, x=x_axis, y=y_axis, color=color)
            st.plotly_chart(fig)

        elif graph_type == 'pie':
            values = st.selectbox("Numerical Values", options=result_columns)
            names = st.selectbox('Labels', options=result_columns)
            fig = px.pie(data_frame=result, names=names, values=values)
            st.plotly_chart(fig)

        elif graph_type == 'scatter':
            x_axis = st.selectbox('X Axis', options=result_columns)
            y_axis = st.selectbox('Y Axis', options=result_columns)
            size = st.selectbox('Size Column', options=[None] + result_columns)
            color = st.selectbox('Color Information', options=[None] + result_columns)
            fig = px.scatter(data_frame=result, x=x_axis, y=y_axis, color=color, size=size)
            st.plotly_chart(fig)

        elif graph_type == 'sunburst':
            path = st.multiselect('Path', options=result_columns)
            # Wait until at least one level has been picked for the path.
            if path:
                fig = px.sunburst(data_frame=result, path=path, values='newcol')
                st.plotly_chart(fig)


# _________________ Machine Learning _______________ #

def render_ml(data):
    """Train the selected classifier on the chosen columns and show accuracy."""
    st.subheader(":orange[Basic Machine Learning]", divider='green')
    ml_task = st.selectbox(
        "Select ML Task",
        ["None", "SVM", "Logistic Regression", "Decision Tree", "K-Nearest Neighbors"],
    )
    if ml_task == "None":
        return

    target_col = st.selectbox("Select Target Column", data.columns)
    feature_cols = st.multiselect("Select Feature Columns", data.columns)

    if target_col in feature_cols:
        # Using the label as a feature would leak the answer to the model.
        st.warning("The target column was removed from the feature columns.")
        feature_cols = [col for col in feature_cols if col != target_col]
    if target_col is None or not feature_cols:
        return

    # Rows without a label cannot be used for training or scoring.
    labelled = data.dropna(subset=[target_col])
    X = labelled[feature_cols]
    y = labelled[target_col]

    # Create model pipeline based on selected task
    model = build_model(ml_task, X)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")


# __________________ File Upload Section _________________ #

def main():
    """Run the app: upload a file, then render every analysis section."""
    file = st.file_uploader('Drop Your CSV, Excel', type=['csv', 'xlsx'])
    if file is None:
        return

    # Top-level boundary: report any failure in the page instead of a traceback.
    try:
        data = load_dataframe(file)
        st.dataframe(data)
        st.success("File Successfully Uploaded", icon='🎉')

        data = render_basic_info(data)
        render_value_counts(data)
        render_groupby(data)
        render_ml(data)
    except Exception as e:
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()