import streamlit as st
from streamlit_option_menu import option_menu
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import plotly.express as px

st.set_page_config(layout="wide")

st.title("Student Behavior Clustering")
st.write("This app performs clustering on student behavior data to identify patterns and segments of students.")
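
# Horizontal navigation between the main "App" view and the "About" page.
# The icons are Bootstrap icon names, as used by streamlit_option_menu.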
tabs = ["App", "About"]
app_mode = option_menu(None, options=tabs, icons=["bar-chart", "info-circle"], default_index=0, orientation="horizontal")

st.sidebar.header("Data and Clustering Settings")
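
# Load the data: use the uploaded CSV if one is provided, otherwise fall back to the bundled default file.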
uploaded_file = st.sidebar.file_uploader(
    "Choose a CSV file or use the default dataset:", type=["csv"]
)

if uploaded_file is None:
    df = pd.read_csv("clustering_data.csv")
else:
    df = pd.read_csv(uploaded_file)
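
# Basic preprocessing: fill missing numeric values with the column mean and derive an
# engagement score as an equally weighted combination of attendance rate and test average.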
df.fillna(df.mean(numeric_only=True), inplace=True)

df['engagement_score'] = (
    df['attendance_rate'] * 0.5 +
    df['test_average'] * 0.5
)
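
# Select the clustering features and standardize them so each contributes equally to the distance metric.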
features = df[['attendance_rate', 'test_average', 'engagement_score']]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
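
# Sidebar controls for choosing the clustering algorithm and its parameters.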
st.sidebar.header("Clustering Settings")
algorithm = st.sidebar.selectbox(
    "Select Algorithm:",
    ("KMeans", "DBSCAN", "Hierarchical")
)
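
# Default parameter values; the expander below overrides the ones relevant to the selected algorithm.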
n_clusters_kmeans = 3
eps = 0.5
min_samples = 5
n_clusters_hierarchical = 3
linkage = 'ward'

with st.sidebar.expander("Algorithm Parameters"):
    if algorithm == "KMeans":
        n_clusters_kmeans = st.slider(
            "Number of Clusters (K)", 2, 10, 3,
            help="Number of clusters to form with KMeans."
        )
    elif algorithm == "DBSCAN":
        eps = st.slider(
            "Epsilon (eps)", 0.1, 2.0, 0.5, 0.1,
            help="Maximum distance between two samples for one to be considered in the neighborhood of the other (DBSCAN)."
        )
        min_samples = st.slider(
            "Min Samples", 2, 10, 5,
            help="Number of samples in a neighborhood for a point to be considered a core point (DBSCAN)."
        )
    else:
        n_clusters_hierarchical = st.slider(
            "Number of Clusters", 2, 10, 3,
            help="Number of clusters to find with hierarchical clustering."
        )
        linkage = st.selectbox(
            "Linkage", ['ward', 'complete', 'average', 'single'],
            help="Linkage criterion to use for hierarchical clustering."
        )
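

# Fit the selected clustering algorithm on the scaled features and return the cluster labels,
# or None if fitting fails.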
def cluster_data(algo_name, **kwargs):
    try:
        if algo_name == "KMeans":
            model = KMeans(n_clusters=kwargs.get('n_clusters', 3), random_state=42)
        elif algo_name == "DBSCAN":
            model = DBSCAN(eps=kwargs.get('eps', 0.5), min_samples=kwargs.get('min_samples', 5))
        else:
            model = AgglomerativeClustering(
                n_clusters=kwargs.get('n_clusters', 3),
                linkage=kwargs.get('linkage', 'ward')
            )

        clusters = model.fit_predict(scaled_features)
        return clusters

    except Exception as e:
        st.error(f"An error occurred during clustering: {e}")
        return None
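

# Run clustering; keyword arguments not relevant to the selected algorithm are passed with
# their defaults and ignored by the chosen model.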
clusters = cluster_data(
    algorithm,
    n_clusters=n_clusters_kmeans if algorithm == "KMeans" else n_clusters_hierarchical,
    eps=eps if algorithm == "DBSCAN" else 0.5,
    min_samples=min_samples if algorithm == "DBSCAN" else 5,
    linkage=linkage if algorithm == "Hierarchical" else "ward",
)
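
# "About" page: static description of the app, data, algorithms, and evaluation metrics.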
if app_mode == "About":
    st.write(
        """
## About
This app performs clustering on student behavior data to identify patterns and segments of students.

### Data
The dataset contains student information such as attendance rate, test average, and a derived engagement score.

### Clustering Algorithms
- **KMeans:** Partitions data into K clusters based on feature similarity.
- **DBSCAN:** Density-based clustering that identifies clusters of varying shapes and flags outliers.
- **Hierarchical:** Builds a tree of clusters to identify subgroups.

### Evaluation Metrics
- **Silhouette Score:** Measures how similar a point is to its own cluster compared to other clusters.
- **Davies-Bouldin Index:** Computes the average similarity between each cluster and its most similar one.
- **Calinski-Harabasz Index:** Ratio of between-cluster dispersion to within-cluster dispersion.

### Cluster Profiling
- Parallel coordinates plot to visualize and compare clusters across multiple features.

### Interpretation of Clusters
- Provides insights into each cluster based on the average values of its features.
        """
    )
    st.write(
        """
## How to Use
1. **Upload Data:** Upload your own CSV file or use the default dataset.
2. **Select Algorithm:** Choose between KMeans, DBSCAN, and Hierarchical clustering.
3. **Set Parameters:** Adjust the clustering parameters in the sidebar.
4. **Interpret Results:** Explore the clustered data, evaluation metrics, and cluster profiles.
        """
    )
    st.write(
        """
## Contact
If you have any questions or feedback, feel free to connect with me on:
- [LinkedIn](https://www.linkedin.com/in/abdellatif-laghjaj)
- [GitHub](https://www.github.com/abdellatif-laghjaj)
        """
    )
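
# Main "App" view: clustered data table, evaluation metrics, and visualizations.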
elif app_mode == "App":
    if clusters is not None:
        df['cluster'] = clusters

        st.subheader(f"Clustered Data using {algorithm}:")
        st.dataframe(df)
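
        # Internal validation metrics are only defined when more than one cluster label is present.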
        if len(set(clusters)) > 1:
            silhouette_avg = silhouette_score(scaled_features, clusters)
            db_index = davies_bouldin_score(scaled_features, clusters)
            ch_index = calinski_harabasz_score(scaled_features, clusters)

            st.subheader("Clustering Evaluation Metrics")
            st.markdown(f"**Silhouette Score:** {silhouette_avg:.2f}")
            st.markdown(f"**Davies-Bouldin Index:** {db_index:.2f}")
            st.markdown(f"**Calinski-Harabasz Index:** {ch_index:.2f}")
        else:
            st.warning("Evaluation metrics are not applicable: only one cluster was found.")
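
        # Interactive 3D scatter plot of the three features, colored by cluster label.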
        st.subheader("Interactive 3D Cluster Visualization")
        fig = px.scatter_3d(
            df,
            x='attendance_rate',
            y='test_average',
            z='engagement_score',
            color='cluster',
            title=f"Student Clusters ({algorithm})",
            labels={'attendance_rate': 'Attendance Rate',
                    'test_average': 'Test Average',
                    'engagement_score': 'Engagement Score'}
        )
        st.plotly_chart(fig)
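
        # Profile clusters by their mean feature values and compare them in a parallel coordinates plot.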
        st.subheader("Cluster Profile Visualization")
        st.write("The parallel coordinates plot compares the average feature values of each cluster across multiple features.")
        profile_features = ['attendance_rate', 'test_average', 'engagement_score']
        cluster_means = df.groupby('cluster')[profile_features].mean().reset_index()

        fig_profile = px.parallel_coordinates(
            cluster_means,
            color='cluster',
            dimensions=profile_features,
            title="Parallel Coordinates Plot for Cluster Profiles"
        )
        st.plotly_chart(fig_profile)
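
        # Summarize each cluster: list its mean feature values and highlight the highest and lowest one.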
        st.subheader("Interpretation of Clusters")
        for cluster_num in cluster_means['cluster']:
            # Mean feature values for this cluster.
            cluster_row = cluster_means[cluster_means['cluster'] == cluster_num]
            st.write(f"**Cluster {cluster_num}:**")
            for feature in profile_features:
                st.write(f"- **{feature.replace('_', ' ').title()}:** {cluster_row[feature].values[0]:.2f}")

            highest_feature = cluster_row[profile_features].idxmax(axis=1).values[0]
            lowest_feature = cluster_row[profile_features].idxmin(axis=1).values[0]

            st.write(f"This cluster has the highest average {highest_feature.replace('_', ' ')} "
                     f"and the lowest average {lowest_feature.replace('_', ' ')}.")
            st.write("---")

    else:
        st.warning("Please configure the clustering settings and run the algorithm first.")