import streamlit as st
from streamlit_option_menu import option_menu
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import plotly.express as px

st.set_page_config(layout="wide")

st.title("Student Behavior Clustering")
st.write("This app performs clustering on student behavior data to identify patterns and segments of students.")
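
# Horizontal navigation between the main "App" view and the "About" page.
# The icons are Bootstrap icon names, as used by streamlit_option_menu.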
tabs = ["App", "About"]
app_mode = option_menu(None, options=tabs, icons=["bar-chart", "info-circle"], default_index=0, orientation="horizontal")

st.sidebar.header("Data and Clustering Settings")
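
# Load the data: use the uploaded CSV if one is provided, otherwise fall back to the bundled default file.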
uploaded_file = st.sidebar.file_uploader(
    "Choose a CSV file or use the default dataset:", type=["csv"]
)

if uploaded_file is None:
    df = pd.read_csv("clustering_data.csv")
else:
    df = pd.read_csv(uploaded_file)
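
# Basic preprocessing: fill missing numeric values with the column mean and derive an
# engagement score as an equally weighted combination of attendance rate and test average.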
df.fillna(df.mean(numeric_only=True), inplace=True)

df['engagement_score'] = (
    df['attendance_rate'] * 0.5 +
    df['test_average'] * 0.5
)
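
# Select the clustering features and standardize them so each contributes equally to the distance metric.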
features = df[['attendance_rate', 'test_average', 'engagement_score']]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
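
# Sidebar controls for choosing the clustering algorithm and its parameters.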
st.sidebar.header("Clustering Settings")
algorithm = st.sidebar.selectbox(
    "Select Algorithm:",
    ("KMeans", "DBSCAN", "Hierarchical")
)
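
# Default parameter values; the expander below overrides the ones relevant to the selected algorithm.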
n_clusters_kmeans = 3
eps = 0.5
min_samples = 5
n_clusters_hierarchical = 3
linkage = 'ward'

with st.sidebar.expander("Algorithm Parameters"):
    if algorithm == "KMeans":
        n_clusters_kmeans = st.slider(
            "Number of Clusters (K)", 2, 10, 3,
            help="Number of clusters to form with KMeans."
        )
    elif algorithm == "DBSCAN":
        eps = st.slider(
            "Epsilon (eps)", 0.1, 2.0, 0.5, 0.1,
            help="Maximum distance between two samples for one to be considered in the neighborhood of the other (DBSCAN)."
        )
        min_samples = st.slider(
            "Min Samples", 2, 10, 5,
            help="Number of samples in a neighborhood for a point to be considered a core point (DBSCAN)."
        )
    else:
        n_clusters_hierarchical = st.slider(
            "Number of Clusters", 2, 10, 3,
            help="Number of clusters to find with hierarchical clustering."
        )
        linkage = st.selectbox(
            "Linkage", ['ward', 'complete', 'average', 'single'],
            help="Linkage criterion to use for hierarchical clustering."
        )
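

# Fit the selected clustering algorithm on the scaled features and return the cluster labels,
# or None if fitting fails.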
def cluster_data(algo_name, **kwargs):
    try:
        if algo_name == "KMeans":
            model = KMeans(n_clusters=kwargs.get('n_clusters', 3), random_state=42)
        elif algo_name == "DBSCAN":
            model = DBSCAN(eps=kwargs.get('eps', 0.5), min_samples=kwargs.get('min_samples', 5))
        else:
            model = AgglomerativeClustering(
                n_clusters=kwargs.get('n_clusters', 3),
                linkage=kwargs.get('linkage', 'ward')
            )

        clusters = model.fit_predict(scaled_features)
        return clusters

    except Exception as e:
        st.error(f"An error occurred during clustering: {e}")
        return None
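

# Run clustering; keyword arguments not relevant to the selected algorithm are passed with
# their defaults and ignored by the chosen model.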
clusters = cluster_data(
    algorithm,
    n_clusters=n_clusters_kmeans if algorithm == "KMeans" else n_clusters_hierarchical,
    eps=eps if algorithm == "DBSCAN" else 0.5,
    min_samples=min_samples if algorithm == "DBSCAN" else 5,
    linkage=linkage if algorithm == "Hierarchical" else "ward",
)
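
# "About" page: static description of the app, data, algorithms, and evaluation metrics.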
if app_mode == "About":
    st.write(
        """
## About
This app performs clustering on student behavior data to identify patterns and segments of students.

### Data
The dataset contains student information such as attendance rate, test average, and a derived engagement score.

### Clustering Algorithms
- **KMeans:** Partitions data into K clusters based on feature similarity.
- **DBSCAN:** Density-based clustering that identifies clusters of varying shapes and flags outliers.
- **Hierarchical:** Builds a tree of clusters to identify subgroups.

### Evaluation Metrics
- **Silhouette Score:** Measures how similar a point is to its own cluster compared to other clusters.
- **Davies-Bouldin Index:** Computes the average similarity between each cluster and its most similar one.
- **Calinski-Harabasz Index:** Ratio of between-cluster dispersion to within-cluster dispersion.

### Cluster Profiling
- Parallel coordinates plot to visualize and compare clusters across multiple features.

### Interpretation of Clusters
- Provides insights into each cluster based on the average values of its features.
        """
    )
    st.write(
        """
## How to Use
1. **Upload Data:** Upload your own CSV file or use the default dataset.
2. **Select Algorithm:** Choose between KMeans, DBSCAN, and Hierarchical clustering.
3. **Set Parameters:** Adjust the clustering parameters in the sidebar.
4. **Interpret Results:** Explore the clustered data, evaluation metrics, and cluster profiles.
        """
    )
    st.write(
        """
## Contact
If you have any questions or feedback, feel free to connect with me on:
- [LinkedIn](https://www.linkedin.com/in/abdellatif-laghjaj)
- [GitHub](https://www.github.com/abdellatif-laghjaj)
        """
    )
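
# Main "App" view: clustered data table, evaluation metrics, and visualizations.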
elif app_mode == "App":
    if clusters is not None:
        df['cluster'] = clusters

        st.subheader(f"Clustered Data using {algorithm}:")
        st.dataframe(df)
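
        # Internal validation metrics are only defined when more than one cluster label is present.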
        if len(set(clusters)) > 1:
            silhouette_avg = silhouette_score(scaled_features, clusters)
            db_index = davies_bouldin_score(scaled_features, clusters)
            ch_index = calinski_harabasz_score(scaled_features, clusters)

            st.subheader("Clustering Evaluation Metrics")
            st.markdown(f"**Silhouette Score:** {silhouette_avg:.2f}")
            st.markdown(f"**Davies-Bouldin Index:** {db_index:.2f}")
            st.markdown(f"**Calinski-Harabasz Index:** {ch_index:.2f}")
        else:
            st.warning("Evaluation metrics are not applicable: only one cluster was found.")
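
        # Interactive 3D scatter plot of the three features, colored by cluster label.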
        st.subheader("Interactive 3D Cluster Visualization")
        fig = px.scatter_3d(
            df,
            x='attendance_rate',
            y='test_average',
            z='engagement_score',
            color='cluster',
            title=f"Student Clusters ({algorithm})",
            labels={'attendance_rate': 'Attendance Rate',
                    'test_average': 'Test Average',
                    'engagement_score': 'Engagement Score'}
        )
        st.plotly_chart(fig)
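
        # Profile clusters by their mean feature values and compare them in a parallel coordinates plot.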
        st.subheader("Cluster Profile Visualization")
        st.write("The parallel coordinates plot compares the average feature values of each cluster across multiple features.")
        profile_features = ['attendance_rate', 'test_average', 'engagement_score']
        cluster_means = df.groupby('cluster')[profile_features].mean().reset_index()

        fig_profile = px.parallel_coordinates(
            cluster_means,
            color='cluster',
            dimensions=profile_features,
            title="Parallel Coordinates Plot for Cluster Profiles"
        )
        st.plotly_chart(fig_profile)
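
        # Summarize each cluster: list its mean feature values and highlight the highest and lowest one.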
        st.subheader("Interpretation of Clusters")
        for cluster_num in cluster_means['cluster']:
            # Mean feature values for this cluster.
            cluster_row = cluster_means[cluster_means['cluster'] == cluster_num]
            st.write(f"**Cluster {cluster_num}:**")
            for feature in profile_features:
                st.write(f"- **{feature.replace('_', ' ').title()}:** {cluster_row[feature].values[0]:.2f}")

            highest_feature = cluster_row[profile_features].idxmax(axis=1).values[0]
            lowest_feature = cluster_row[profile_features].idxmin(axis=1).values[0]

            st.write(f"This cluster has the highest average {highest_feature.replace('_', ' ')} "
                     f"and the lowest average {lowest_feature.replace('_', ' ')}.")
            st.write("---")

    else:
        st.warning("Please configure the clustering settings and run the algorithm first.")