krishaamer committed on
Commit
ba2cd93
·
1 Parent(s): 6e83e2f

Clarify several pages

Browse files
fields/investing_flat_fields.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Survey question texts (Traditional Chinese) about money and investing.
# The strings are kept verbatim; the trailing comments are English glosses only.
investing_flat_fields = [
    "你/妳會對金錢感到焦慮嗎?",  # Do you feel anxious about money?
    "你/妳會對金錢很節儉嗎?",  # Are you frugal with money?
    "你/妳會經常存錢嗎?",  # Do you save money regularly?
    "你/妳對自己的財務知識滿意嗎?",  # Are you satisfied with your financial knowledge?
    "你/妳投資會考慮環保嗎?",  # Do you consider the environment when investing?
]
page_attitudes.py CHANGED
@@ -10,7 +10,7 @@ from fields.translation_mapping import translation_mapping
10
  @st.cache_data
11
  def show(df):
12
  st.title("Student Attitudes (Overall)")
13
- st.write("Student Attitudes across all Likert fields without clustering")
14
 
15
  # Chinese font
16
  chinese_font = FontProperties(fname='mingliu.ttf')
 
10
  @st.cache_data
11
  def show(df):
12
  st.title("Student Attitudes (Overall)")
13
+ st.write("Student attitudes across all likert fields without clustering")
14
 
15
  # Chinese font
16
  chinese_font = FontProperties(fname='mingliu.ttf')
page_home.py CHANGED
@@ -3,15 +3,16 @@ import streamlit as st
3
 
4
  def show():
5
  st.title("Survey Overview")
6
- st.markdown('''A survey of Taiwanese college students (excludes overseas Chinese-speaking students as well as foreign students).
7
- \n* Survey Oct. 13 - Nov. 13, 2023
8
  \n* 2000 cards with a QR code printed out
9
- \n* Distribution conducted in 8 universities (handing out the cards)
10
- \n* 1272 people started the survey, 507 quit
11
- \n* 764 people completed the whole survey
12
  \n* Data after filtering: 675 people aged 18-26 (Gen-Z), Taiwanese, current students in BA (large majority), MA (small minority) or PhD level (very few respondents)
13
- \n* 36 Likert fields used for clustering the students into 3 personas with K-means clustering.
14
- \n* 14 product features (multiple-choice)
15
  \n* 6 choice experiments
16
- \n* Text fields used to enrich the personas'''
 
17
  )
 
3
 
4
def show():
    """Render the "Survey Overview" page: a title followed by a markdown fact sheet."""
    st.title("Survey Overview")
    # Every bullet line starts with an explicit "\n" escape, which puts the "*"
    # marker at the start of a fresh line in the markdown source.
    st.markdown('''A survey of Taiwanese college students (excludes overseas Chinese-speaking students as well as foreign students) covering attitudes towards shopping, saving, investing, economy, nature, sustainability, and AI.
    \n* Survey Oct. 13th - Nov. 3rd, 2023
    \n* 2000 cards with a QR code printed out
    \n* Distribution conducted at 8 universities (handing out the cards)
    \n* 1289 people started the survey, 518 quit
    \n* 771 people completed the whole survey
    \n* Data after filtering: 675 people aged 18-26 (Gen-Z), Taiwanese, current students in BA (large majority), MA (small minority) or PhD level (very few respondents)
    \n* 36 likert fields (5-point scale) used for clustering the students into 3 personas with K-means clustering
    \n* 14 product features (multiple-choice) used for K-modes clustering
    \n* 6 choice experiments
    \n* 2 option ranking questions
    \n* 10 text fields used to enrich the personas'''
    )
page_personas.py CHANGED
@@ -1,6 +1,7 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
  import squarify
@@ -11,36 +12,49 @@ from fields.likert_flat_fields import likert_flat_fields
11
 
12
  #@st.cache_data
13
  def show(df):
14
- st.title("Clustering Students to Build Personas")
15
- st.write("Clustering Students based on 36 fields of Likert data")
16
-
17
- st.title("Top 10 highest agreement between personas")
18
-
19
- create_treemap()
20
-
21
- st.title("Top 10 highest disagreement between Personas")
22
-
23
  # Chinese font
24
  chinese_font = FontProperties(fname='mingliu.ttf')
25
- perform_kmeans_clustering(df)
26
- show_clustering_heatmap(df, chinese_font)
27
-
28
  # Prepare the data and perform clustering and PCA
29
  df_clustered, pca, cluster_centers = prepare_data_for_pca(df)
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # Show a scatterplot with all clusters included
32
- plot_scatterplot(df_clustered, pca, cluster_centers, chinese_font,
33
- title="Combined Clusters Scatterplot")
 
34
 
35
  # Show a scatterplot for each cluster separately
36
  for cluster_id in range(3):
37
  df_cluster = df_clustered[df_clustered['Cluster'] == cluster_id]
38
- plot_scatterplot(df_cluster, pca, cluster_centers, chinese_font,
39
- title=f"Scatterplot for Cluster {cluster_id}")
 
 
 
 
40
 
 
 
41
 
 
42
 
43
- def perform_kmeans_clustering(df):
44
 
45
  # Select only the relevant columns for clustering
46
  df_likert_real_data = df[likert_flat_fields]
@@ -59,20 +73,12 @@ def perform_kmeans_clustering(df):
59
  cluster_means_real_data = df_likert_real_data.groupby(
60
  'Cluster').mean().reset_index()
61
 
62
- # Count the number of students in each cluster
63
- cluster_counts = df_likert_real_data['Cluster'].value_counts(
64
- ).sort_index().reset_index()
65
- cluster_counts.columns = ['Cluster', 'Number of Students']
66
-
67
  # Display the table
68
- st.table(cluster_counts)
69
- st.write("Mean Scores for Each Question in Each Cluster:")
70
  st.table(cluster_means_real_data)
71
 
72
 
73
  def show_clustering_heatmap(df, chinese_font):
74
- st.title("Heatmap")
75
-
76
  # Filter the DataFrame to only include the Likert scale fields
77
  df_likert_data = df[likert_flat_fields]
78
 
@@ -100,9 +106,9 @@ def show_clustering_heatmap(df, chinese_font):
100
  ax.set_ylabel('Cluster ID', fontproperties=chinese_font)
101
 
102
  # Rotate the x-axis labels for better readability
103
- plt.setp(ax.get_xticklabels(), rotation=90)
 
104
 
105
- # Use the figure object (fig) in st.pyplot() to display the plot
106
  st.pyplot(fig)
107
 
108
 
@@ -125,32 +131,49 @@ def prepare_data_for_pca(df):
125
  return df_clustered, pca, cluster_centers
126
 
127
 
128
- def plot_scatterplot(df, pca, cluster_centers, chinese_font, title):
129
  # Create a figure and a set of subplots
130
  fig, ax = plt.subplots(figsize=(10, 10))
131
 
132
- # Plot the scatterplot on the created axes ax
133
- sns.scatterplot(x='Component_1', y='Component_2', hue='Cluster',
134
- data=df, palette='viridis', s=100, alpha=0.6, ax=ax)
135
 
136
- if 'Cluster' in df:
137
- # Add the cluster centers for all clusters if plotting combined scatterplot
138
- ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='red', s=200,
139
- alpha=0.75, marker='o', edgecolors='k')
140
- else:
141
- # Add the cluster center for the specified cluster if plotting individual scatterplot
142
- center = cluster_centers[df['Cluster'].unique()[0]]
143
- ax.scatter(center[0], center[1], c='red', s=200,
 
 
 
 
144
  alpha=0.75, marker='o', edgecolors='k')
 
 
 
145
 
 
146
  ax.set_title(title, fontproperties=chinese_font)
147
- ax.set_xlabel('Principal Component 1')
148
- ax.set_ylabel('Principal Component 2')
149
 
150
- # Use the figure object (fig) in st.pyplot() to display the plot
 
 
 
 
 
 
 
 
151
  st.pyplot(fig)
152
 
153
 
 
 
154
  def create_treemap():
155
  categories = {
156
  'Ethical Consumption and Labor Concerns': 3.2,
@@ -181,7 +204,7 @@ def create_treemap():
181
  ax.axis('off')
182
 
183
  # Add a title to the plot
184
- plt.title('Treemap of Survey Question Categories by Average Agreement Level', fontsize=15)
185
 
186
  # Use the figure object (fig) in st.pyplot() to display the plot
187
  st.pyplot(fig)
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ import textwrap
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
  import squarify
 
12
 
13
  #@st.cache_data
14
def show(df):
    """Render the "Personas" page.

    Draws, in order: the page title, one scatterplot containing every cluster,
    one scatterplot per cluster, the table and heatmap of mean answers, and
    the agreement treemap.

    Args:
        df: Survey DataFrame; passed unchanged to prepare_data_for_pca,
            get_kmeans_table and show_clustering_heatmap.
    """
    # Chinese font
    chinese_font = FontProperties(fname='mingliu.ttf')

    # Prepare the data and perform clustering and PCA
    df_clustered, pca, cluster_centers = prepare_data_for_pca(df)

    # Titles
    st.title("Personas")
    st.write("Based on 36 fields of likert data")

    # Build the cluster -> colour mapping once and hand the same dict to every
    # plot, so a cluster keeps its colour in the combined and per-cluster plots.
    unique_clusters = df_clustered['Cluster'].unique()
    palette = sns.color_palette('pastel', n_colors=len(unique_clusters))
    cluster_palette = dict(zip(unique_clusters, palette))

    # Cluster names
    cluster_names = {
        0: 'Sustainable',
        1: 'Moderate',
        2: 'Frugal',
    }

    # Show a scatterplot with all clusters included
    st.markdown(
        "<h2 style='text-align: center;'>Clustering Students to Build 3 Personas</h2>",
        unsafe_allow_html=True)
    plot_scatterplot(df_clustered, pca, cluster_centers, chinese_font, cluster_palette,
                     cluster_names, "Distinct Respondent Profiles Based on K-means Clustering")

    # Show a scatterplot for each cluster separately. Iterating the name
    # mapping (keys 0, 1, 2 in insertion order) keeps the loop in step with
    # the personas defined above instead of repeating the count.
    for cluster_id in cluster_names:
        df_cluster = df_clustered[df_clustered['Cluster'] == cluster_id]
        plot_scatterplot(df_cluster, pca, cluster_centers, chinese_font, cluster_palette,
                         cluster_names, title=f"Scatterplot for Cluster {cluster_id}")

    st.markdown(
        "<h2 style='text-align: center;'>Mean Answer Scores</h2>",
        unsafe_allow_html=True)
    get_kmeans_table(df)
    show_clustering_heatmap(df, chinese_font)

    st.markdown(
        "<h2 style='text-align: center;'>Agreement between personas</h2>",
        unsafe_allow_html=True)

    create_treemap()
56
 
57
+ def get_kmeans_table(df):
58
 
59
  # Select only the relevant columns for clustering
60
  df_likert_real_data = df[likert_flat_fields]
 
73
  cluster_means_real_data = df_likert_real_data.groupby(
74
  'Cluster').mean().reset_index()
75
 
 
 
 
 
 
76
  # Display the table
77
+ st.write("Mean response values for each likert question in each cluster:")
 
78
  st.table(cluster_means_real_data)
79
 
80
 
81
  def show_clustering_heatmap(df, chinese_font):
 
 
82
  # Filter the DataFrame to only include the Likert scale fields
83
  df_likert_data = df[likert_flat_fields]
84
 
 
106
  ax.set_ylabel('Cluster ID', fontproperties=chinese_font)
107
 
108
  # Rotate the x-axis labels for better readability
109
+ wrapped_labels = [textwrap.fill(label.get_text(), width=10) for label in ax.get_xticklabels()]
110
+ ax.set_xticklabels(wrapped_labels, rotation=45, fontproperties=chinese_font)
111
 
 
112
  st.pyplot(fig)
113
 
114
 
 
131
  return df_clustered, pca, cluster_centers
132
 
133
 
134
def plot_scatterplot(df, pca, cluster_centers, chinese_font, cluster_palette, cluster_names, title):
    """Draw respondents as a scatterplot coloured by cluster and render it in Streamlit.

    Args:
        df: DataFrame with 'Component_1', 'Component_2' and 'Cluster' columns.
            Only the clusters present in this frame get a centre marker and a
            legend entry.
        pca: Not used by this function; kept so existing call sites keep working.
        cluster_centers: Indexable by cluster label; each entry holds the
            centre's x coordinate at [0] and y coordinate at [1].
        chinese_font: Font properties applied to the title, the axis labels
            and the count annotations.
        cluster_palette: Mapping from cluster label to the colour used for the
            cluster's points and its centre marker.
        cluster_names: Mapping from cluster label to the display name shown in
            the legend.
        title: Title of the plot.
    """
    # Create a figure and a set of subplots
    fig, ax = plt.subplots(figsize=(10, 10))

    # Respondents per cluster; shown on each centre marker and in the legend.
    cluster_counts = df['Cluster'].value_counts()

    # Plot the scatterplot
    scatter = sns.scatterplot(x='Component_1', y='Component_2', hue='Cluster',
                              data=df, palette=cluster_palette, s=100, alpha=0.6, ax=ax)

    # Get unique cluster labels sorted by value
    unique_clusters = sorted(df['Cluster'].unique())

    # Mark the centre of every cluster present in df and print its size on top.
    for label in unique_clusters:
        center = cluster_centers[label]
        ax.scatter(center[0], center[1], c=cluster_palette[label], s=200,
                   alpha=0.75, marker='o', edgecolors='k')
        # Annotate the number of respondents in the cluster
        ax.text(center[0], center[1], str(cluster_counts[label]), color='black',
                ha='center', va='center', fontproperties=chinese_font)

    # Set titles and labels
    ax.set_title(title, fontproperties=chinese_font)
    ax.set_xlabel('Principal Component 1', fontproperties=chinese_font)
    ax.set_ylabel('Principal Component 2', fontproperties=chinese_font)

    # Only the handles are reused; the default labels are replaced below.
    handles, _ = scatter.get_legend_handles_labels()

    # Labels are built in sorted cluster order.
    # NOTE(review): assumes seaborn returns the handles in that same order - confirm.
    new_labels = [f'Cluster {label}: {cluster_names[label]} (n={cluster_counts[label]})'
                  for label in unique_clusters]
    ax.legend(handles=handles, labels=new_labels, title='Personas', loc='upper right')

    # Use the figure object (fig) to display the plot
    st.pyplot(fig)

    # pyplot keeps a reference to every figure it creates until it is closed;
    # close this one so figures do not pile up as the page is re-run.
    plt.close(fig)
173
 
174
 
175
+
176
+
177
  def create_treemap():
178
  categories = {
179
  'Ethical Consumption and Labor Concerns': 3.2,
 
204
  ax.axis('off')
205
 
206
  # Add a title to the plot
207
+ plt.title('Average Agreement Level by Question Category', fontsize=15)
208
 
209
  # Use the figure object (fig) in st.pyplot() to display the plot
210
  st.pyplot(fig)
page_shopping.py CHANGED
@@ -13,14 +13,15 @@ def show(df):
13
  # Load the Chinese font
14
  chinese_font = FontProperties(fname='mingliu.ttf', size=12)
15
  st.title("Shopping")
16
- st.write("Clustering Students based on Product Feature choices")
17
- st.title("Boycott Count")
18
- show_boycott_count(df, font_prop=chinese_font)
19
  clusters = perform_kmodes_clustering(df, prod_feat_flat_fields)
20
- st.title("Feature Preferences")
 
21
  show_radar_chart(clusters, font_prop=chinese_font)
22
- st.title("Feature Preferences")
23
  plot_feature_preferences(clusters, font_prop=chinese_font)
 
 
 
24
 
25
  def show_boycott_count(df, font_prop):
26
  # Count the number of people who have invested and who have not
@@ -75,9 +76,9 @@ def perform_kmodes_clustering(df, feature_columns, n_clusters=3):
75
  def show_radar_chart(clusters, font_prop):
76
 
77
  df_dict={
78
- 'Eco-conscious Shopper (n=340)': clusters[0],
79
- 'Casual Eco-Interested)': clusters[1],
80
- 'Eco Advocate (n=126)': clusters[2]
81
  }
82
 
83
  feature_translations_dict = dict(zip(prod_feat_flat_fields, feature_translations))
@@ -146,9 +147,9 @@ def plot_feature_preferences(clusters, font_prop):
146
  "老實說我對任何環保資訊都沒有太多興趣\nHonestly, I'm Not Very Interested in Any Eco Information",
147
  "投資前比較公司的環保表現\nCompare Companies' Environmental Performance Before Investing"
148
  ],
149
- 'Eco-conscious Shopper (n=340)': [0.367, 0.415, 0.191, 0.176, 0.079, 1.000, 0.197, 0.265, 0.144, 0.241, 0.144, 0.332, 0.044, 0.188],
150
- 'Casual Eco-Interested (n=215)': [0.260, 0.163, 0.153, 0.191, 0.107, 0.000, 0.135, 0.219, 0.172, 0.186, 0.093, 0.214, 0.233, 0.130],
151
- 'Eco Advocate (n=126)': [0.825, 0.881, 0.460, 0.746, 0.230, 0.881, 0.667, 0.690, 0.421, 0.865, 0.468, 0.778, 0.143, 0.738]
152
  }
153
  # Create a DataFrame
154
  df = pd.DataFrame(data)
 
13
  # Load the Chinese font
14
  chinese_font = FontProperties(fname='mingliu.ttf', size=12)
15
  st.title("Shopping")
16
+ st.write("Clustering students based on AI-assistant feature choices")
 
 
17
  clusters = perform_kmodes_clustering(df, prod_feat_flat_fields)
18
+ st.markdown(
19
+ f"<h2 style='text-align: center;'>Feature Preferences</h2>", unsafe_allow_html=True)
20
  show_radar_chart(clusters, font_prop=chinese_font)
 
21
  plot_feature_preferences(clusters, font_prop=chinese_font)
22
+ st.markdown(
23
+ f"<h2 style='text-align: center;'>Boycott Count</h2>", unsafe_allow_html=True)
24
+ show_boycott_count(df, font_prop=chinese_font)
25
 
26
  def show_boycott_count(df, font_prop):
27
  # Count the number of people who have invested and who have not
 
76
  def show_radar_chart(clusters, font_prop):
77
 
78
  df_dict={
79
+ 'Conscious (n=340)': clusters[0],
80
+ 'Interested (n=215)': clusters[1],
81
+ 'Advocate (n=126)': clusters[2]
82
  }
83
 
84
  feature_translations_dict = dict(zip(prod_feat_flat_fields, feature_translations))
 
147
  "老實說我對任何環保資訊都沒有太多興趣\nHonestly, I'm Not Very Interested in Any Eco Information",
148
  "投資前比較公司的環保表現\nCompare Companies' Environmental Performance Before Investing"
149
  ],
150
+ 'Conscious (n=340)': [0.367, 0.415, 0.191, 0.176, 0.079, 1.000, 0.197, 0.265, 0.144, 0.241, 0.144, 0.332, 0.044, 0.188],
151
+ 'Interested (n=215)': [0.260, 0.163, 0.153, 0.191, 0.107, 0.000, 0.135, 0.219, 0.172, 0.186, 0.093, 0.214, 0.233, 0.130],
152
+ 'Advocate (n=126)': [0.825, 0.881, 0.460, 0.746, 0.230, 0.881, 0.667, 0.690, 0.421, 0.865, 0.468, 0.778, 0.143, 0.738]
153
  }
154
  # Create a DataFrame
155
  df = pd.DataFrame(data)
page_tests.py CHANGED
@@ -1,11 +1,13 @@
1
  from scipy.stats import chisquare
 
2
  import streamlit as st
3
  import pandas as pd
4
  from fields.likert_flat_fields import likert_flat_fields
5
 
6
- @st.cache_data
7
  def show(df):
8
  st.title("Statistical Tests")
 
9
  show_chi_square_results(df)
10
 
11
  def show_chi_square_results(df):
@@ -13,40 +15,46 @@ def show_chi_square_results(df):
13
 
14
  for field in likert_flat_fields:
15
  observed_values = df[field].value_counts().sort_index()
16
- observed_values = observed_values.astype(float)
17
  expected_values = [len(df) / len(observed_values)] * len(observed_values)
18
- expected_values = [float(x) for x in expected_values]
19
  chi_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
20
  chi_square_results[field] = {'Chi-Square Statistic': chi_stat, 'p-value': p_value}
21
 
22
  chi_square_df = pd.DataFrame.from_dict(chi_square_results, orient='index')
23
  chi_square_df['p-value'] = chi_square_df['p-value'].astype(float)
24
 
 
 
 
25
  # Reset index to add a sequence number
26
  chi_square_df.reset_index(inplace=True)
27
  chi_square_df.rename(columns={'index': 'Question'}, inplace=True)
28
 
29
  # Define thresholds for highlighting
30
  chi_square_threshold = 300 # example threshold for high Chi-Square value
31
- p_value_threshold = 1e-50 # example threshold for very low p-value
32
 
33
  # Apply the highlighting
34
- def highlight(value):
35
- if isinstance(value, float) and value > chi_square_threshold:
36
- return "background-color: yellow"
37
- elif isinstance(value, float) and value < p_value_threshold:
38
  return "background-color: yellow"
 
 
39
  else:
40
- return ""
41
-
42
- # Apply the highlighting to numeric columns only
43
- chi_square_df_styled = chi_square_df.style.applymap(highlight, subset=pd.IndexSlice[:, ['Chi-Square Statistic', 'p-value']])
44
 
45
- # Convert p-values to string after highlighting
46
- chi_square_df['p-value'] = chi_square_df['p-value'].apply(lambda x: "{:.2e}".format(x))
 
 
 
 
 
 
 
47
 
48
- # Convert styled DataFrame to HTML after p-value formatting
49
  chi_square_html = chi_square_df_styled.to_html(escape=False)
50
 
51
  # Display the HTML with unsafe_allow_html set to True
52
- st.markdown(chi_square_html, unsafe_allow_html=True)
 
1
  from scipy.stats import chisquare
2
+ from functools import partial
3
  import streamlit as st
4
  import pandas as pd
5
  from fields.likert_flat_fields import likert_flat_fields
6
 
7
+ #@st.cache_data
8
def show(df):
    """Render the "Statistical Tests" page.

    Writes the page title and a one-line key for the cell colours, then hands
    the DataFrame to show_chi_square_results.

    Args:
        df: Survey DataFrame; passed unchanged to show_chi_square_results.
    """
    st.title("Statistical Tests")
    st.write("Yellow Chi-Square statistics (high) and pink p-values (low) are statistically meaningful")
    show_chi_square_results(df)
12
 
13
  def show_chi_square_results(df):
 
15
 
16
  for field in likert_flat_fields:
17
  observed_values = df[field].value_counts().sort_index()
18
+ observed_values = observed_values.reindex(index=range(1, 6), fill_value=0)
19
  expected_values = [len(df) / len(observed_values)] * len(observed_values)
 
20
  chi_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
21
  chi_square_results[field] = {'Chi-Square Statistic': chi_stat, 'p-value': p_value}
22
 
23
  chi_square_df = pd.DataFrame.from_dict(chi_square_results, orient='index')
24
  chi_square_df['p-value'] = chi_square_df['p-value'].astype(float)
25
 
26
+ # Convert p-values to string for formatting
27
+ chi_square_df['p-value'] = chi_square_df['p-value'].apply(lambda x: "{:.2e}".format(x))
28
+
29
  # Reset index to add a sequence number
30
  chi_square_df.reset_index(inplace=True)
31
  chi_square_df.rename(columns={'index': 'Question'}, inplace=True)
32
 
33
  # Define thresholds for highlighting
34
  chi_square_threshold = 300 # example threshold for high Chi-Square value
35
+ p_value_threshold = 1e-50 # example threshold for very low p-value
36
 
37
  # Apply the highlighting
38
def highlight(value, chi_square_threshold, p_value_threshold, col_name):
    """Return the CSS background for one table cell, or None for no styling.

    Args:
        value: Cell content. It is parsed with float(), so both numbers and
            numeric strings such as "1.23e-60" are accepted.
        chi_square_threshold: A 'Chi-Square Statistic' cell strictly above
            this value is highlighted yellow.
        p_value_threshold: A 'p-value' cell strictly below this value is
            highlighted pink.
        col_name: Name of the column the cell belongs to; any name other than
            'Chi-Square Statistic' or 'p-value' is never highlighted.

    Returns:
        A "background-color: ..." CSS string, or None.
    """
    if col_name not in ('Chi-Square Statistic', 'p-value'):
        return None

    # A cell that cannot be read as a number is left unstyled rather than
    # aborting the rendering of the whole table.
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None

    if col_name == 'Chi-Square Statistic':
        return "background-color: yellow" if number > chi_square_threshold else None
    return "background-color: pink" if number < p_value_threshold else None
 
 
 
45
 
46
+ # Create partial functions for each column to apply the highlight with the column name
47
+ highlight_chi_square = partial(highlight, chi_square_threshold=chi_square_threshold,
48
+ p_value_threshold=p_value_threshold, col_name='Chi-Square Statistic')
49
+ highlight_p_value = partial(highlight, chi_square_threshold=chi_square_threshold,
50
+ p_value_threshold=p_value_threshold, col_name='p-value')
51
+
52
+ # Apply the highlighting to the DataFrame
53
+ chi_square_df_styled = chi_square_df.style.applymap(highlight_chi_square, subset=['Chi-Square Statistic']) \
54
+ .applymap(highlight_p_value, subset=['p-value'])
55
 
56
+ # Convert styled DataFrame to HTML
57
  chi_square_html = chi_square_df_styled.to_html(escape=False)
58
 
59
  # Display the HTML with unsafe_allow_html set to True
60
+ st.markdown(chi_square_html, unsafe_allow_html=True)