nazneen committed
Commit bbb450c
1 Parent(s): 49418b5

added offline files

Files changed (1): app.py +4 -9
app.py CHANGED
```diff
@@ -140,16 +140,15 @@ def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.
 
 
 @st.cache(ttl=600)
-def get_data(spotlight, emb):
-    preds = spotlight.outputs.numpy()
-    losses = spotlight.losses.numpy()
+def get_data(inference, emb):
+    preds = inference.outputs.numpy()
+    losses = inference.losses.numpy()
     embeddings = pd.DataFrame(emb, columns=['x', 'y'])
     num_examples = len(losses)
     # dataset_labels = [dataset[i]['label'] for i in range(num_examples)]
     return pd.concat([pd.DataFrame(np.transpose(np.vstack([dataset[:num_examples]['content'],
         dataset[:num_examples]['label'], preds, losses])), columns=['content', 'label', 'pred', 'loss']), embeddings], axis=1)
 
-@st.cache(ttl=600)
 def clustering(data,num_clusters):
     X = np.array(data['embedding'].tolist())
     kclusterer = KMeansClusterer(
@@ -158,11 +157,8 @@ def clustering(data,num_clusters):
     assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
     data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
     data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
-
-
     return data, assigned_clusters
 
-@st.cache(ttl=600)
 def kmeans(df, num_clusters=3):
     data_hl = df.loc[df['slice'] == 'high-loss']
     data_kmeans,clusters = clustering(data_hl,num_clusters)
@@ -171,7 +167,6 @@ def kmeans(df, num_clusters=3):
     merged['cluster'] = merged['cluster'].fillna(num_clusters).astype('int')
     return merged
 
-@st.cache(ttl=600)
 def distance_from_centroid(row):
     return sdist.norm(row['embedding'] - row['centroid'].tolist())
 
@@ -249,7 +244,7 @@ if __name__ == "__main__":
     high_loss = losses.quantile(loss_quantile)
     data_df['slice'] = 'high-loss'
     data_df['slice'] = data_df['slice'].where(data_df['loss'] > high_loss, 'low-loss')
-
+
     with rcol:
         with st.spinner(text='loading...'):
             st.markdown('<h3>Word Distribution in Error Slice</h3>', unsafe_allow_html=True)
```
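Two notes on these hunks. First, `get_data`'s parameter is renamed from `spotlight` to `inference` (with matching `.outputs`/`.losses` accesses), and `@st.cache(ttl=600)` is dropped from `clustering`, `kmeans`, and `distance_from_centroid`; under the legacy `st.cache` API, functions that mutate and return the same DataFrame, as `clustering` does, commonly trigger `CachedObjectMutationWarning`, which is a plausible reason for removing the decorator there. Second, the `kclusterer = KMeansClusterer(` call is cut off at the hunk boundary, so here is a minimal runnable sketch of the clustering step, assuming the app uses nltk's `KMeansClusterer` with cosine distance (the `distance`, `repeats`, and `avoid_empty_clusters` arguments below are illustrative assumptions, not values from this diff):

```python
import numpy as np
import pandas as pd
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance

def clustering(data, num_clusters):
    # Stack per-row embedding lists into an (n_examples, dim) matrix.
    X = np.array(data['embedding'].tolist())
    # Assumed arguments: the diff truncates this call, so distance/repeats/
    # avoid_empty_clusters are illustrative, not the app's real settings.
    kclusterer = KMeansClusterer(num_clusters, distance=cosine_distance,
                                 repeats=5, avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
    data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
    # Attach each row's centroid so distance_from_centroid can use it later.
    data['centroid'] = data['cluster'].apply(lambda c: kclusterer.means()[c])
    return data, assigned_clusters

# Smoke test on synthetic 2-D embeddings.
df = pd.DataFrame({'embedding': [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]]})
clustered, labels = clustering(df, num_clusters=2)
print(labels)  # e.g. [0, 0, 1, 1]
```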
 
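One more note on `get_data`: stacking `content`, `label`, `preds`, and `losses` with `np.vstack` and transposing coerces everything to a single dtype, so `pred` and `loss` arrive as strings until recast. A small sketch with made-up stand-ins for the app's `dataset` and `inference` outputs, showing the effect and a column-wise alternative that preserves dtypes:

```python
import numpy as np
import pandas as pd

# Made-up stand-ins for the app's dataset contents and inference outputs.
contents = ['good movie', 'boring plot']
labels = [1, 0]
preds = np.array([1, 1])
losses = np.array([0.12, 2.31])

# The approach in get_data: vstack + transpose forces one common dtype,
# so every column comes back as strings/objects.
stacked = pd.DataFrame(np.transpose(np.vstack([contents, labels, preds, losses])),
                       columns=['content', 'label', 'pred', 'loss'])
print(stacked.dtypes)  # all object; 'loss' needs astype(float) before quantile()

# Column-wise construction keeps numeric columns numeric.
columnwise = pd.DataFrame({'content': contents, 'label': labels,
                           'pred': preds, 'loss': losses})
print(columnwise.dtypes)  # label/pred int64, loss float64
```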
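The final hunk appears to touch only whitespace on the blank line, but the surrounding slicing logic is easy to misread: `Series.where(cond, other)` keeps the existing value where the condition holds and substitutes `other` elsewhere, so every row starts as 'high-loss' and is demoted to 'low-loss' unless its loss exceeds the quantile threshold. A self-contained sketch with synthetic losses (the 0.95 quantile mirrors the `loss_quantile` default in `frequent_tokens`; the data is made up):

```python
import pandas as pd

data_df = pd.DataFrame({'loss': [0.1, 0.2, 0.4, 0.5, 3.0]})
loss_quantile = 0.95  # mirrors the default in frequent_tokens above
high_loss = data_df['loss'].quantile(loss_quantile)  # 2.5 for this data

# Label every row 'high-loss', then keep that label only where the
# condition holds; all other rows become 'low-loss'.
data_df['slice'] = 'high-loss'
data_df['slice'] = data_df['slice'].where(data_df['loss'] > high_loss, 'low-loss')
print(data_df)
#    loss      slice
# 0   0.1   low-loss
# 1   0.2   low-loss
# 2   0.4   low-loss
# 3   0.5   low-loss
# 4   3.0  high-loss
```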