nazneen committed on
Commit 945ee1d
1 Parent(s): 050aca6

multi error clusters

Files changed (1)
  1. app.py +29 -17
app.py CHANGED
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 import torch
-import json
+import math
 from tqdm import tqdm
 from math import floor
 from datasets import load_dataset
@@ -104,7 +104,8 @@ def quant_panel(embedding_df):
     st.markdown("* Each **point** is an input example.")
     st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
     st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
-    st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
+    #st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
+    st.altair_chart(data_comparison(embedding_df), use_container_width=True)


 def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
@@ -136,7 +137,7 @@ def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
     for i, (token) in enumerate(tokens_sorted[:top_k]):
         top_tokens.append(['%10s' % (tokenizer.decode(token)), '%.4f' % (token_frequencies[token]), '%.4f' % (
             token_frequencies_error[token]), '%4.2f' % (token_lrs[token])])
-    return pd.DataFrame(top_tokens, columns=['Token', 'Freq', 'Freq error slice', 'lrs'])
+    return pd.DataFrame(top_tokens, columns=['Token', 'Freq', 'Freq error slice', 'Ratio w/ smoothing'])


 @st.cache(ttl=600)
@@ -160,12 +161,12 @@ def clustering(data,num_clusters):
     return data, assigned_clusters

 def kmeans(df, num_clusters=3):
-    data_hl = df.loc[df['slice'] == 'high-loss']
-    data_kmeans,clusters = clustering(data_hl,num_clusters)
-    merged = pd.merge(df, data_kmeans, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
-    merged.drop(merged.filter(regex='_y$').columns.tolist(),axis=1,inplace=True)
-    merged['cluster'] = merged['cluster'].fillna(num_clusters).astype('int')
-    return merged
+    #data_hl = df.loc[df['slice'] == 'high-loss']
+    data_kmeans,clusters = clustering(df,num_clusters)
+    #merged = pd.merge(df, data_kmeans, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
+    #merged.drop(merged.filter(regex='_y$').columns.tolist(),axis=1,inplace=True)
+    #merged['cluster'] = merged['cluster'].fillna(num_clusters).astype('int')
+    return data_kmeans

 def distance_from_centroid(row):
     return sdist.norm(row['embedding'] - row['centroid'].tolist())
@@ -173,16 +174,16 @@ def distance_from_centroid(row):
 @st.cache(ttl=600)
 def topic_distribution(weights, smoothing=0.01):
     topic_frequencies = defaultdict(float)
-    topic_frequencies_spotlight = defaultdict(float)
+    topic_frequencies_error = defaultdict(float)
     weights_uniform = np.full_like(weights, 1 / len(weights))
     num_examples = len(weights)
     for i in range(num_examples):
         example = dataset[i]
         category = example['title']
         topic_frequencies[category] += weights_uniform[i]
-        topic_frequencies_spotlight[category] += weights[i]
+        topic_frequencies_error[category] += weights[i]

-    topic_ratios = {c: (smoothing + topic_frequencies_spotlight[c]) / (
+    topic_ratios = {c: (smoothing + topic_frequencies_error[c]) / (
         smoothing + topic_frequencies[c]) for c in topic_frequencies}

     categories_sorted = map(lambda x: x[0], sorted(
@@ -191,11 +192,9 @@ def topic_distribution(weights, smoothing=0.01):
     topic_distr = []
     for category in categories_sorted:
         topic_distr.append(['%.3f' % topic_frequencies[category], '%.3f' %
-            topic_frequencies_spotlight[category], '%.2f' % topic_ratios[category], '%s' % category])
+            topic_frequencies_error[category], '%.2f' % topic_ratios[category], '%s' % category])

     return pd.DataFrame(topic_distr, columns=['Overall frequency', 'Error frequency', 'Ratio', 'Category'])
-    # for category in categories_sorted:
-    #     return(topic_frequencies[category], topic_frequencies_spotlight[category], topic_ratios[category], category)

 def populate_session(dataset,model):
     data_df = read_file_to_df('./assets/data/'+dataset+ '_'+ model+'.parquet')
@@ -239,13 +238,17 @@ if __name__ == "__main__":
     #populate_session(dataset, model)
     data_df = read_file_to_df('./assets/data/'+dataset+ '_'+ model+'.parquet')
     loss_quantile = st.sidebar.slider(
-        "Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.95
+        "Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.99
     )
+    data_df = data_df.drop(data_df[data_df.pred == data_df.label].index) #drop rows that are not errors
     data_df['loss'] = data_df['loss'].astype(float)
     losses = data_df['loss']
     high_loss = losses.quantile(loss_quantile)
     data_df['slice'] = 'high-loss'
     data_df['slice'] = data_df['slice'].where(data_df['loss'] > high_loss, 'low-loss')
+    data_hl = data_df.drop(data_df[data_df['slice'] == 'low-loss'].index) #drop rows that are not high-loss
+    data_ll = data_df.drop(data_df[data_df['slice'] == 'high-loss'].index)
+    df_list = [d for _, d in data_hl.groupby(['label'])] # allow clustering over each error type (fp, fn for binary classification)

     with lcol:
         st.markdown('<h3>Error Slices</h3>',unsafe_allow_html=True)
@@ -279,7 +282,16 @@ if __name__ == "__main__":

     if run_kmeans == 'True':
         with st.spinner(text='running kmeans...'):
-            merged = kmeans(data_df,num_clusters=num_clusters)
+            merged = pd.DataFrame()
+            ind = 0
+            for df in df_list:
+                #num_clusters = int(math.sqrt(len(df)/2))
+                kmeans_df = kmeans(df,num_clusters=num_clusters)
+                #print(kmeans_df.loc[kmeans_df['cluster'].idxmax()])
+                kmeans_df['cluster'] = kmeans_df['cluster'] + ind*num_clusters
+                ind = ind+1
+                merged = pd.concat([merged, kmeans_df])
+            merged = pd.concat([merged, data_ll])

     with st.spinner(text='loading visualization...'):
         quant_panel(merged)
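
For context on the new loop in the last hunk: it clusters each error type separately (the high-loss rows grouped by label) and offsets the cluster ids so they remain distinct after concatenation. Below is a minimal, self-contained sketch of that pattern, not the app's code: scikit-learn's KMeans stands in for the clustering() helper in app.py, the embeddings are random toy data, and only the column names (embedding, label, cluster) mirror the real DataFrame.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans  # stand-in for app.py's clustering() helper

rng = np.random.default_rng(0)

# Toy stand-in for the high-loss error slice: an embedding per example plus its label
# (for binary sentiment, the label groups correspond to the two error types, FP vs. FN).
data_hl = pd.DataFrame({
    "embedding": list(rng.normal(size=(40, 8))),
    "label": rng.integers(0, 2, size=40),
})

num_clusters = 3
merged = pd.DataFrame()
ind = 0
for _, df in data_hl.groupby("label"):  # one k-means per error type
    X = np.stack(df["embedding"].to_list())
    km = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(X)
    df = df.copy()
    # offset so cluster ids stay unique across label groups: 0..2, then 3..5, ...
    df["cluster"] = km.labels_ + ind * num_clusters
    ind += 1
    merged = pd.concat([merged, df])

# cluster ids now identify (error type, cluster) pairs for the scatter plot
print(merged.groupby(["label", "cluster"]).size())

In the commit itself, the untouched low-loss rows (data_ll) are concatenated once after the loop, so quant_panel can still draw them in gray alongside the clustered high-loss points.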