Spaces:

krishaamer
/

ziran

Sleeping

App Files Files Community

krishaamer committed on Nov 7, 2023

Commit

ba2cd93

1 Parent(s): 6e83e2f

Clarify several pages

Browse files

Files changed (6) hide show

fields/investing_flat_fields.py +7 -0
page_attitudes.py +1 -1
page_home.py +9 -8
page_personas.py +67 -44
page_shopping.py +12 -11
page_tests.py +24 -16

fields/investing_flat_fields.py ADDED Viewed

	@@ -0,0 +1,7 @@

+investing_flat_fields = [  # survey question texts about money and investing (Traditional Chinese); presumably used as column keys - confirm against callers
+  "你/妳會對金錢感到焦慮嗎？",  # "Do you feel anxious about money?"
+  "你/妳會對金錢很節儉嗎？",  # "Are you frugal with money?"
+  "你/妳會經常存錢嗎？",  # "Do you save money often?"
+  "你/妳對自己的財務知識滿意嗎？",  # "Are you satisfied with your own financial knowledge?"
+  "你/妳投資會考慮環保嗎？"  # "Do you take the environment into account when investing?"
+]

page_attitudes.py CHANGED Viewed

@@ -10,7 +10,7 @@ from fields.translation_mapping import translation_mapping
 @st.cache_data
 def show(df):
     st.title("Student Attitudes (Overall)")
-    st.write("Student Attitudes across all Likert fields without clustering")
     # Chinese font
     chinese_font = FontProperties(fname='mingliu.ttf')

 @st.cache_data
 def show(df):
     st.title("Student Attitudes (Overall)")
+    st.write("Student attitudes across all likert fields without clustering")
     # Chinese font
     chinese_font = FontProperties(fname='mingliu.ttf')

page_home.py CHANGED Viewed

@@ -3,15 +3,16 @@ import streamlit as st
 def show():
     st.title("Survey Overview")
-    st.markdown('''A survey of Taiwanese college students (excludes overseas Chinese-speaking students as well as foreign students).
-\n* Survey Oct. 13 - Nov. 13, 2023
 \n* 2000 cards with a QR code printed out
-\n* Distribution conducted in 8 universities (handing out the cards)
-\n* 1272 people started the survey, 507 quit
-\n* 764 people completed the whole survey
 \n* Data after filtering: 675 people aged 18-26 (Gen-Z), Taiwanese, current students in BA (large majority), MA (small minority) or PhD level (very few respondents)
-\n* 36 Likert fields used for clustering the students into 3 personas with K-means clustering.
-\n* 14 product features (multiple-choice)
 \n* 6 choice experiments
-\n* Text fields used to enrich the personas'''
                 )

 def show():  # Render the "Survey Overview" page: a title followed by one markdown summary of the survey
     st.title("Survey Overview")  # page heading; everything below is a single triple-quoted markdown string
+    st.markdown('''A survey of Taiwanese college students (excludes overseas Chinese-speaking students as well as foreign students) covering attitudes towards shopping, saving, investing, economy, nature, sustainability, and AI.
+\n* Survey Oct. 13th - Nov. 3rd, 2023
 \n* 2000 cards with a QR code printed out
+\n* Distribution conducted at 8 universities (handing out the cards)
+\n* 1289 people started the survey, 518 quit
+\n* 771 people completed the whole survey
 \n* Data after filtering: 675 people aged 18-26 (Gen-Z), Taiwanese, current students in BA (large majority), MA (small minority) or PhD level (very few respondents)
+\n* 36 likert fields (5-point scale) used for clustering the students into 3 personas with K-means clustering
+\n* 14 product features (multiple-choice) used for K-modes clustering
 \n* 6 choice experiments
+\n* 2 option ranking questions
+\n* 10 text fields used to enrich the personas'''
                 )  # end of st.markdown(...); each bullet line above starts with an explicit \n escape in addition to its literal line break

page_personas.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 import squarify
@@ -11,36 +12,49 @@ from fields.likert_flat_fields import likert_flat_fields
 #@st.cache_data
 def show(df):
-    st.title("Clustering Students to Build Personas")
-    st.write("Clustering Students based on 36 fields of Likert data")
-    st.title("Top 10 highest agreement between personas")
-    create_treemap()
-    st.title("Top 10 highest disagreement between Personas")
     # Chinese font
     chinese_font = FontProperties(fname='mingliu.ttf')
-    perform_kmeans_clustering(df)
-    show_clustering_heatmap(df, chinese_font)
     # Prepare the data and perform clustering and PCA
     df_clustered, pca, cluster_centers = prepare_data_for_pca(df)
     # Show a scatterplot with all clusters included
-    plot_scatterplot(df_clustered, pca, cluster_centers, chinese_font,
-                     title="Combined Clusters Scatterplot")
     # Show a scatterplot for each cluster separately
     for cluster_id in range(3):
         df_cluster = df_clustered[df_clustered['Cluster'] == cluster_id]
-        plot_scatterplot(df_cluster, pca, cluster_centers, chinese_font,
-                         title=f"Scatterplot for Cluster {cluster_id}")
-def perform_kmeans_clustering(df):
     # Select only the relevant columns for clustering
     df_likert_real_data = df[likert_flat_fields]
@@ -59,20 +73,12 @@ def perform_kmeans_clustering(df):
     cluster_means_real_data = df_likert_real_data.groupby(
         'Cluster').mean().reset_index()
-    # Count the number of students in each cluster
-    cluster_counts = df_likert_real_data['Cluster'].value_counts(
-    ).sort_index().reset_index()
-    cluster_counts.columns = ['Cluster', 'Number of Students']
     # Display the table
-    st.table(cluster_counts)
-    st.write("Mean Scores for Each Question in Each Cluster:")
     st.table(cluster_means_real_data)
 def show_clustering_heatmap(df, chinese_font):
-    st.title("Heatmap")
     # Filter the DataFrame to only include the Likert scale fields
     df_likert_data = df[likert_flat_fields]
@@ -100,9 +106,9 @@ def show_clustering_heatmap(df, chinese_font):
     ax.set_ylabel('Cluster ID', fontproperties=chinese_font)
     # Rotate the x-axis labels for better readability
-    plt.setp(ax.get_xticklabels(), rotation=90)
-    # Use the figure object (fig) in st.pyplot() to display the plot
     st.pyplot(fig)
@@ -125,32 +131,49 @@ def prepare_data_for_pca(df):
     return df_clustered, pca, cluster_centers
-def plot_scatterplot(df, pca, cluster_centers, chinese_font, title):
     # Create a figure and a set of subplots
     fig, ax = plt.subplots(figsize=(10, 10))
-    # Plot the scatterplot on the created axes ax
-    sns.scatterplot(x='Component_1', y='Component_2', hue='Cluster',
-                    data=df, palette='viridis', s=100, alpha=0.6, ax=ax)
-    if 'Cluster' in df:
-        # Add the cluster centers for all clusters if plotting combined scatterplot
-        ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='red', s=200,
-                   alpha=0.75, marker='o', edgecolors='k')
-    else:
-        # Add the cluster center for the specified cluster if plotting individual scatterplot
-        center = cluster_centers[df['Cluster'].unique()[0]]
-        ax.scatter(center[0], center[1], c='red', s=200,
                    alpha=0.75, marker='o', edgecolors='k')
     ax.set_title(title, fontproperties=chinese_font)
-    ax.set_xlabel('Principal Component 1')
-    ax.set_ylabel('Principal Component 2')
-    # Use the figure object (fig) in st.pyplot() to display the plot
     st.pyplot(fig)
 def create_treemap():
     categories = {
         'Ethical Consumption and Labor Concerns': 3.2,
@@ -181,7 +204,7 @@ def create_treemap():
     ax.axis('off')
     # Add a title to the plot
-    plt.title('Treemap of Survey Question Categories by Average Agreement Level', fontsize=15)
     # Use the figure object (fig) in st.pyplot() to display the plot
     st.pyplot(fig)

 import streamlit as st
 import pandas as pd
 import numpy as np
+import textwrap
 import matplotlib.pyplot as plt
 import seaborn as sns
 import squarify
 #@st.cache_data
 def show(df):  # Render the personas page: PCA scatterplots (combined, then one per cluster), mean-score table, heatmap and treemap
     # Font object built from mingliu.ttf; passed to the plotting helpers below for text rendering
     chinese_font = FontProperties(fname='mingliu.ttf')
     # Prepare the data and perform clustering and PCA
     df_clustered, pca, cluster_centers = prepare_data_for_pca(df)
+    # Page heading and subtitle
+    st.title("Personas")
+    st.write("Based on 36 fields of likert data")
+    # Build one label -> color mapping up front so every scatterplot below uses the same color per cluster
+    unique_clusters = df_clustered['Cluster'].unique()
+    palette = sns.color_palette('pastel', n_colors=len(unique_clusters))
+    cluster_palette = {cluster: color for cluster, color in zip(unique_clusters, palette)}
+    # Display name for each cluster label, used in the scatterplot legends (keys 0-2 match range(3) below)
+    cluster_names = {
+        0: 'Sustainable',
+        1: 'Moderate',
+        2: 'Frugal',
+    }
     # Show a scatterplot with all clusters included
+    st.markdown(
+                f"<h2 style='text-align: center;'>Clustering Students to Build 3 Personas</h2>", unsafe_allow_html=True)
+    plot_scatterplot(df_clustered, pca, cluster_centers, chinese_font, cluster_palette, cluster_names, "Distinct Respondent Profiles Based on K-means Clustering")
     # Show a scatterplot for each cluster separately (cluster count is hard-coded to 3 here)
     for cluster_id in range(3):
         df_cluster = df_clustered[df_clustered['Cluster'] == cluster_id]
+        plot_scatterplot(df_cluster, pca, cluster_centers, chinese_font, cluster_palette, cluster_names, title=f"Scatterplot for Cluster {cluster_id}")
+    st.markdown(
+                f"<h2 style='text-align: center;'>Mean Answer Scores</h2>", unsafe_allow_html=True)
+    get_kmeans_table(df)  # note: the table and heatmap receive the original df, not df_clustered
+    show_clustering_heatmap(df, chinese_font)
+    st.markdown(
+                f"<h2 style='text-align: center;'>Agreement between personas</h2>", unsafe_allow_html=True)
+    create_treemap()
+def get_kmeans_table(df):
     # Select only the relevant columns for clustering
     df_likert_real_data = df[likert_flat_fields]
     cluster_means_real_data = df_likert_real_data.groupby(
         'Cluster').mean().reset_index()
     # Display the table
+    st.write("Mean response values for each likert question in each cluster:")
     st.table(cluster_means_real_data)
 def show_clustering_heatmap(df, chinese_font):
     # Filter the DataFrame to only include the Likert scale fields
     df_likert_data = df[likert_flat_fields]
     ax.set_ylabel('Cluster ID', fontproperties=chinese_font)
     # Rotate the x-axis labels for better readability
+    wrapped_labels = [textwrap.fill(label.get_text(), width=10) for label in ax.get_xticklabels()]
+    ax.set_xticklabels(wrapped_labels, rotation=45, fontproperties=chinese_font)
     st.pyplot(fig)
     return df_clustered, pca, cluster_centers
+def plot_scatterplot(df, pca, cluster_centers, chinese_font, cluster_palette, cluster_names, title):  # Draw one scatterplot of df's two components, mark cluster centers with counts, render via Streamlit; pca is accepted but not referenced in this body
     # Create a figure and a set of subplots
     fig, ax = plt.subplots(figsize=(10, 10))
+    # Number of rows per cluster label in the df being plotted (used for center annotations and legend text)
+    cluster_counts = df['Cluster'].value_counts()
+    # Plot the points, colored by cluster via the caller-supplied label -> color mapping
+    scatter = sns.scatterplot(x='Component_1', y='Component_2', hue='Cluster',
+                              data=df, palette=cluster_palette, s=100, alpha=0.6, ax=ax)
+    # Cluster labels present in df, in ascending order
+    unique_clusters = sorted(df['Cluster'].unique())
+    # Mark the center of every cluster present in df (all clusters for the combined plot, one for a per-cluster plot)
+    for label in unique_clusters:
+        # Look up this cluster's center by its label; center[0] / center[1] are used as the x / y position
+        center = cluster_centers[label]
+        ax.scatter(center[0], center[1], c=cluster_palette[label], s=200,
                     alpha=0.75, marker='o', edgecolors='k')
+        # Write the cluster's row count on top of its center marker
+        ax.text(center[0], center[1], str(cluster_counts[label]), color='black',
+                ha='center', va='center', fontproperties=chinese_font)
+    # Set titles and labels
     ax.set_title(title, fontproperties=chinese_font)
+    ax.set_xlabel('Principal Component 1', fontproperties=chinese_font)
+    ax.set_ylabel('Principal Component 2', fontproperties=chinese_font)
+    # Take the legend handles seaborn created; the returned labels are not used (replaced by new_labels below)
+    handles, labels = scatter.get_legend_handles_labels()
+    # Legend text per cluster: label, persona name and row count
+    new_labels = [f'Cluster {label}: {cluster_names[label]} (n={cluster_counts[label]})' for label in unique_clusters]
+    # NOTE(review): assumes the handles come back in the same ascending-label order as unique_clusters - confirm
+    ax.legend(handles=handles, labels=new_labels, title='Personas', loc='upper right')
+    # Use the figure object (fig) to display the plot
     st.pyplot(fig)
 def create_treemap():
     categories = {
         'Ethical Consumption and Labor Concerns': 3.2,
     ax.axis('off')
     # Add a title to the plot
+    plt.title('Average Agreement Level by Question Category', fontsize=15)
     # Use the figure object (fig) in st.pyplot() to display the plot
     st.pyplot(fig)

page_shopping.py CHANGED Viewed

@@ -13,14 +13,15 @@ def show(df):
     # Load the Chinese font
     chinese_font = FontProperties(fname='mingliu.ttf', size=12)
     st.title("Shopping")
-    st.write("Clustering Students based on Product Feature choices")
-    st.title("Boycott Count")
-    show_boycott_count(df, font_prop=chinese_font)
     clusters = perform_kmodes_clustering(df, prod_feat_flat_fields)
-    st.title("Feature Preferences")
     show_radar_chart(clusters, font_prop=chinese_font)
-    st.title("Feature Preferences")
     plot_feature_preferences(clusters, font_prop=chinese_font)
 def show_boycott_count(df, font_prop):
     # Count the number of people who have invested and who have not
@@ -75,9 +76,9 @@ def perform_kmodes_clustering(df, feature_columns, n_clusters=3):
 def show_radar_chart(clusters, font_prop):
     df_dict={
-        'Eco-conscious Shopper (n=340)': clusters[0],
-        'Casual Eco-Interested)': clusters[1],
-        'Eco Advocate (n=126)': clusters[2]
     }
     feature_translations_dict = dict(zip(prod_feat_flat_fields, feature_translations))
@@ -146,9 +147,9 @@ def plot_feature_preferences(clusters, font_prop):
             "老實說我對任何環保資訊都沒有太多興趣\nHonestly, I'm Not Very Interested in Any Eco Information",
             "投資前比較公司的環保表現\nCompare Companies' Environmental Performance Before Investing"
         ],
-        'Eco-conscious Shopper (n=340)': [0.367, 0.415, 0.191, 0.176, 0.079, 1.000, 0.197, 0.265, 0.144, 0.241, 0.144, 0.332, 0.044, 0.188],
-        'Casual Eco-Interested (n=215)': [0.260, 0.163, 0.153, 0.191, 0.107, 0.000, 0.135, 0.219, 0.172, 0.186, 0.093, 0.214, 0.233, 0.130],
-        'Eco Advocate (n=126)': [0.825, 0.881, 0.460, 0.746, 0.230, 0.881, 0.667, 0.690, 0.421, 0.865, 0.468, 0.778, 0.143, 0.738]
 }
     # Create a DataFrame
     df = pd.DataFrame(data)

     # Load the Chinese font
     chinese_font = FontProperties(fname='mingliu.ttf', size=12)
     st.title("Shopping")
+    st.write("Clustering students based on AI-assistant feature choices")
     clusters = perform_kmodes_clustering(df, prod_feat_flat_fields)
+    st.markdown(
+                f"<h2 style='text-align: center;'>Feature Preferences</h2>", unsafe_allow_html=True)
     show_radar_chart(clusters, font_prop=chinese_font)
     plot_feature_preferences(clusters, font_prop=chinese_font)
+    st.markdown(
+                f"<h2 style='text-align: center;'>Boycott Count</h2>", unsafe_allow_html=True)
+    show_boycott_count(df, font_prop=chinese_font)
 def show_boycott_count(df, font_prop):
     # Count the number of people who have invested and who have not
 def show_radar_chart(clusters, font_prop):
     df_dict={
+        'Conscious (n=340)': clusters[0],
+        'Interested (n=215)': clusters[1],
+        'Advocate (n=126)': clusters[2]
     }
     feature_translations_dict = dict(zip(prod_feat_flat_fields, feature_translations))
             "老實說我對任何環保資訊都沒有太多興趣\nHonestly, I'm Not Very Interested in Any Eco Information",
             "投資前比較公司的環保表現\nCompare Companies' Environmental Performance Before Investing"
         ],
+        'Conscious (n=340)': [0.367, 0.415, 0.191, 0.176, 0.079, 1.000, 0.197, 0.265, 0.144, 0.241, 0.144, 0.332, 0.044, 0.188],
+        'Interested (n=215)': [0.260, 0.163, 0.153, 0.191, 0.107, 0.000, 0.135, 0.219, 0.172, 0.186, 0.093, 0.214, 0.233, 0.130],
+        'Advocate (n=126)': [0.825, 0.881, 0.460, 0.746, 0.230, 0.881, 0.667, 0.690, 0.421, 0.865, 0.468, 0.778, 0.143, 0.738]
 }
     # Create a DataFrame
     df = pd.DataFrame(data)

page_tests.py CHANGED Viewed

@@ -1,11 +1,13 @@
 from scipy.stats import chisquare
 import streamlit as st
 import pandas as pd
 from fields.likert_flat_fields import likert_flat_fields
-@st.cache_data
 def show(df):
     st.title("Statistical Tests")
     show_chi_square_results(df)
 def show_chi_square_results(df):
@@ -13,40 +15,46 @@ def show_chi_square_results(df):
     for field in likert_flat_fields:
         observed_values = df[field].value_counts().sort_index()
-        observed_values = observed_values.astype(float)
         expected_values = [len(df) / len(observed_values)] * len(observed_values)
-        expected_values = [float(x) for x in expected_values]
         chi_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
         chi_square_results[field] = {'Chi-Square Statistic': chi_stat, 'p-value': p_value}
     chi_square_df = pd.DataFrame.from_dict(chi_square_results, orient='index')
     chi_square_df['p-value'] = chi_square_df['p-value'].astype(float)
     # Reset index to add a sequence number
     chi_square_df.reset_index(inplace=True)
     chi_square_df.rename(columns={'index': 'Question'}, inplace=True)
     # Define thresholds for highlighting
     chi_square_threshold = 300  # example threshold for high Chi-Square value
-    p_value_threshold = 1e-50   # example threshold for very low p-value
     # Apply the highlighting
-    def highlight(value):
-        if isinstance(value, float) and value > chi_square_threshold:
-            return "background-color: yellow"
-        elif isinstance(value, float) and value < p_value_threshold:
             return "background-color: yellow"
         else:
-            return ""
-     # Apply the highlighting to numeric columns only
-    chi_square_df_styled = chi_square_df.style.applymap(highlight, subset=pd.IndexSlice[:, ['Chi-Square Statistic', 'p-value']])
-    # Convert p-values to string after highlighting
-    chi_square_df['p-value'] = chi_square_df['p-value'].apply(lambda x: "{:.2e}".format(x))
-    # Convert styled DataFrame to HTML after p-value formatting
     chi_square_html = chi_square_df_styled.to_html(escape=False)
     # Display the HTML with unsafe_allow_html set to True
-    st.markdown(chi_square_html, unsafe_allow_html=True)

 from scipy.stats import chisquare
+from functools import partial
 import streamlit as st
 import pandas as pd
 from fields.likert_flat_fields import likert_flat_fields
+#@st.cache_data
 def show(df):  # Render the "Statistical Tests" page: title, a legend for the highlight colors, then the chi-square results for df
     st.title("Statistical Tests")
+    st.write("Yellow Chi-Square statistics (high) and pink p-values (low) are statistically meaningful")
     show_chi_square_results(df)
 def show_chi_square_results(df):
     for field in likert_flat_fields:
         observed_values = df[field].value_counts().sort_index()
+        observed_values = observed_values.reindex(index=range(1, 6), fill_value=0)
         expected_values = [len(df) / len(observed_values)] * len(observed_values)
         chi_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
         chi_square_results[field] = {'Chi-Square Statistic': chi_stat, 'p-value': p_value}
     chi_square_df = pd.DataFrame.from_dict(chi_square_results, orient='index')
     chi_square_df['p-value'] = chi_square_df['p-value'].astype(float)
+    # Convert p-values to string for formatting
+    chi_square_df['p-value'] = chi_square_df['p-value'].apply(lambda x: "{:.2e}".format(x))
     # Reset index to add a sequence number
     chi_square_df.reset_index(inplace=True)
     chi_square_df.rename(columns={'index': 'Question'}, inplace=True)
     # Define thresholds for highlighting
     chi_square_threshold = 300  # example threshold for high Chi-Square value
+    p_value_threshold = 1e-50  # example threshold for very low p-value
     # Apply the highlighting
+    def highlight(value, chi_square_threshold, p_value_threshold, col_name):
+        if col_name == 'Chi-Square Statistic' and float(value) > chi_square_threshold:
             return "background-color: yellow"
+        elif col_name == 'p-value' and float(value) < p_value_threshold:
+            return "background-color: pink"
         else:
+            return None
+    # Create partial functions for each column to apply the highlight with the column name
+    highlight_chi_square = partial(highlight, chi_square_threshold=chi_square_threshold,
+                                   p_value_threshold=p_value_threshold, col_name='Chi-Square Statistic')
+    highlight_p_value = partial(highlight, chi_square_threshold=chi_square_threshold,
+                                p_value_threshold=p_value_threshold, col_name='p-value')
+    # Apply the highlighting to the DataFrame
+    chi_square_df_styled = chi_square_df.style.applymap(highlight_chi_square, subset=['Chi-Square Statistic']) \
+                                               .applymap(highlight_p_value, subset=['p-value'])
+    # Convert styled DataFrame to HTML
     chi_square_html = chi_square_df_styled.to_html(escape=False)
     # Display the HTML with unsafe_allow_html set to True
+    st.markdown(chi_square_html, unsafe_allow_html=True)