nazneen committed
Commit 7cba420
1 Parent(s): bbb450c
Files changed (1)
  1. app.py +14 -10
app.py CHANGED
@@ -198,7 +198,7 @@ def topic_distribution(weights, smoothing=0.01):
     # return(topic_frequencies[category], topic_frequencies_spotlight[category], topic_ratios[category], category)

 def populate_session(dataset,model):
-    data_df = pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'.parquet')
+    data_df = read_file_to_df('./assets/data/'+dataset+ '_'+ model+'.parquet')
     if model == 'albert-base-v2-yelp-polarity':
         tokenizer = AutoTokenizer.from_pretrained('textattack/'+model)
     else:
@@ -208,7 +208,9 @@ def populate_session(dataset,model):
     if "selected_slice" not in st.session_state:
         st.session_state["selected_slice"] = None

-
+@st.cache(ttl=600)
+def read_file_to_df(file):
+    return pd.read_parquet(file)

 if __name__ == "__main__":
     ### STREAMLIT APP CONGFIG ###
@@ -235,7 +237,7 @@ if __name__ == "__main__":
     ### LOAD DATA AND SESSION VARIABLES ###
     ##uncomment the next next line to run dynamically and not from file
     #populate_session(dataset, model)
-    data_df = pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'.parquet')
+    data_df = read_file_to_df('./assets/data/'+dataset+ '_'+ model+'.parquet')
     loss_quantile = st.sidebar.slider(
         "Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.95
     )
@@ -250,7 +252,7 @@
         st.markdown('<h3>Word Distribution in Error Slice</h3>', unsafe_allow_html=True)
         #uncomment the next two lines to run dynamically and not from file
         #commontokens = frequent_tokens(data_df, tokenizer, loss_quantile=loss_quantile)
-        commontokens = pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'_commontokens.parquet')
+        commontokens = read_file_to_df('./assets/data/'+dataset+ '_'+ model+'_commontokens.parquet')
         with st.expander("How to read the table:"):
             st.markdown("* The table displays the most frequent tokens in error slices, relative to their frequencies in the val set.")
         st.write(commontokens)
@@ -260,20 +262,22 @@
     num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)

     if run_kmeans == 'True':
-        merged = kmeans(data_df,num_clusters=num_clusters)
+        with st.spinner(text='running kmeans...'):
+            merged = kmeans(data_df,num_clusters=num_clusters)
     with lcol:
         st.markdown('<h3>Error Slices</h3>',unsafe_allow_html=True)
-        dataframe=pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'_error-slices.parquet')
+        with st.expander("How to read the table:"):
+            st.markdown("* *Error slice* refers to the subset of evaluation dataset the model performs poorly on.")
+            st.markdown("* The table displays model error slices on the evaluation dataset, sorted by loss.")
+            st.markdown("* Each row is an input example that includes the label, model pred, loss, and error cluster.")
+        with st.spinner(text='loading error slice...'):
+            dataframe=read_file_to_df('./assets/data/'+dataset+ '_'+ model+'_error-slices.parquet')
         #uncomment the next next line to run dynamically and not from file
         # dataframe = merged[['content', 'label', 'pred', 'loss', 'cluster']].sort_values(
         #     by=['loss'], ascending=False)
         # table_html = dataframe.to_html(
         #     columns=['content', 'label', 'pred', 'loss', 'cluster'], max_rows=50)
         # table_html = table_html.replace("<th>", '<th align="left">') # left-align the headers
-        with st.expander("How to read the table:"):
-            st.markdown("* *Error slice* refers to the subset of evaluation dataset the model performs poorly on.")
-            st.markdown("* The table displays model error slices on the evaluation dataset, sorted by loss.")
-            st.markdown("* Each row is an input example that includes the label, model pred, loss, and error cluster.")
         st.write(dataframe,width=900, height=300)

     quant_panel(merged)
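
The core change in this commit is routing every `pd.read_parquet` call through a single module-level `read_file_to_df` helper decorated with `@st.cache(ttl=600)`. Because Streamlit re-executes the whole script on every widget interaction, the uncached reads were hitting disk on each rerun; the cached helper reuses the loaded DataFrame for up to ten minutes per file path. Below is a minimal self-contained sketch of the same pattern; the file path and the `describe()` call are hypothetical stand-ins, and `st.cache` is the pre-1.18 Streamlit API used here (newer versions would use `@st.cache_data`).

```python
import pandas as pd
import streamlit as st

@st.cache(ttl=600)  # reuse the result for up to 600 s across reruns
def read_file_to_df(file):
    # Streamlit reruns the entire script on every widget change;
    # caching keeps this disk read from repeating each time.
    return pd.read_parquet(file)

if __name__ == "__main__":
    # Hypothetical path for illustration; the app assembles its real
    # paths from the selected dataset and model names.
    df = read_file_to_df('./assets/data/example.parquet')
    # Slow steps get a spinner so the UI shows progress, mirroring
    # the kmeans and error-slice loads in the commit.
    with st.spinner(text='processing...'):
        st.write(df.describe())
```

Defining the helper at module level, above `if __name__ == "__main__":`, lets every call site (`data_df`, `commontokens`, `dataframe`) share one cache keyed by file path.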