acmc committed on
Commit
115f2ee
·
1 Parent(s): 36c5b68
app.py CHANGED
@@ -112,30 +112,30 @@ def process_user_input_concept(concept_chooser):
112
 
113
  # Now, average the similarities
114
  scores = np.stack(list(all_similarities.values()), axis=0)
115
- scores = np.mean(all_similarities, axis=0)
116
 
117
  table_df = pd.DataFrame(
118
  {
119
- "Institution": s,
120
- "Mean similarity": scores.flatten(),
121
- "Institution name": all_ids_institutions[:, 1],
122
  # "num_articles": all_ids_institutions[:, 2].astype(int),
123
  }
124
  )
125
 
126
  # Add the individual similarities
127
  for i, concept in enumerate(chosen_concepts):
128
- table_df[f"Similarity to {chosen_concepts_names[i]}"] = all_similarities[concept]
129
 
130
  # Reorder the columns so that the mean similarity is after the individual similarities and before the institution name
131
  table_df = table_df[
132
- ["Institution"]
133
- + [f"Similarity to {chosen_concepts_names[i]}" for i in range(len(chosen_concepts))]
134
- + ["Mean similarity", "Institution name"]
135
  ]
136
 
137
  # Sort by mean similarity
138
- table_df = table_df.sort_values(by=["Mean similarity"], ascending=False)
139
 
140
  concept_names = [get_concept_name(concept_uri) for concept_uri in chosen_concepts]
141
  return (
@@ -151,7 +151,7 @@ def calculate_emdeddings_and_pca(table):
151
  gr.Info("Performing PCA and clustering...")
152
  # Perform PCA
153
  embeddings_of_institutions = model.get_embeddings(
154
- entities=np.array(table["Institution"])
155
  )
156
 
157
  entity_embeddings_pca = pca(embeddings_of_institutions)
@@ -161,9 +161,9 @@ def calculate_emdeddings_and_pca(table):
161
 
162
  plot_df = pd.DataFrame(
163
  {
164
- "Embedding (coord 1)": entity_embeddings_pca[:, 0],
165
- "Embedding (coord 2)": entity_embeddings_pca[:, 1],
166
- "Cluster": "Cluster" + pd.Series(clusters).astype(str),
167
  }
168
  )
169
 
@@ -173,16 +173,16 @@ def calculate_emdeddings_and_pca(table):
173
 
174
 
175
  def click_on_institution(table, embeddings_var, evt: gr.SelectData):
176
- institution_id = table["Institution"][evt.index[0]]
177
  try:
178
  embeddings_df = embeddings_var["embeddings_df"]
179
  plot_df = pd.DataFrame(
180
  {
181
- "Institution": table["Institution"].values,
182
- "Institution name": table["Institution name"].values,
183
- "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
184
- "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
185
- "Cluster": embeddings_df["Cluster"].values,
186
  # "num_articles": table["num_articles"].values,
187
  }
188
  )
@@ -196,11 +196,11 @@ def click_on_show_plot(table):
196
 
197
  plot_df = pd.DataFrame(
198
  {
199
- "Institution": table["Institution"].values,
200
- "Institution_name": table["Institution Name"].values,
201
- "Embedding (coord 1)": embeddings_df["Embedding (coord 1)"].values,
202
- "Embedding (coord 2)": embeddings_df["Embedding (coord 2)"].values,
203
- "Cluster": embeddings_df["Cluster"].values,
204
  # "num_articles": table["num_articles"].values,
205
  }
206
  )
@@ -215,17 +215,17 @@ def plot_embeddings(plot_df, institution_id):
215
  # fig.title("{} embeddings".format(parameter).capitalize())
216
  ax = sns.scatterplot(
217
  data=plot_df,
218
- x="Embedding (coord 1)",
219
- y="Embedding (coord 2)",
220
- hue="Cluster",
221
  )
222
 
223
- row_of_institution = plot_df[plot_df["Institution"] == institution_id]
224
  if not row_of_institution.empty:
225
  ax.text(
226
- row_of_institution["Embedding (coord 1)"],
227
- row_of_institution["Embedding (coord 2)"],
228
- row_of_institution["Institution name"].values[0],
229
  horizontalalignment="left",
230
  size="medium",
231
  color="black",
@@ -233,20 +233,20 @@ def plot_embeddings(plot_df, institution_id):
233
  )
234
  # Also draw a point for the institution
235
  ax.scatter(
236
- row_of_institution["Embedding (coord 1)"],
237
- row_of_institution["Embedding (coord 2)"],
238
  color="black",
239
  s=100,
240
  marker="x",
241
  )
242
  # texts = []
243
  # for i, point in plot_df.iterrows():
244
- # if point["Institution"] == institution_id:
245
  # texts.append(
246
  # fig.text(
247
- # point["Embedding (coord 1)"] + 0.02,
248
- # point["Embedding (coord 2)"] + 0.01,
249
- # str(point["Institution name"]),
250
  # )
251
  # )
252
  # adjust_text(texts)
@@ -257,9 +257,9 @@ def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.Sele
257
  """
258
  Get the authors of an institution
259
  """
260
- institution = institutions_table["Institution"][0]
261
  number_of_row = evt.index[0]
262
- institution = institutions_table["Institution"][number_of_row]
263
  concepts = separate_concepts(concept_chooser)
264
  results_dfs = []
265
  for concept in concepts:
@@ -269,7 +269,7 @@ def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.Sele
269
  WHERE {{
270
  ?author a <urn:acmcmc:unis:Author> .
271
  ?author <urn:acmcmc:unis:name> ?name .
272
- ?article <urn:acmcmc:unis:written_in_institution> <{Institution}> .
273
  ?article <urn:acmcmc:unis:has_author> ?author .
274
  ?article <urn:acmcmc:unis:related_to_concept> <{concept}> .
275
  }}
 
112
 
113
  # Now, average the similarities
114
  scores = np.stack(list(all_similarities.values()), axis=0)
115
+ scores = np.mean(scores, axis=0)
116
 
117
  table_df = pd.DataFrame(
118
  {
119
+ "institution": s,
120
+ "mean_similarity": scores.flatten(),
121
+ "institution_name": all_ids_institutions[:, 1],
122
  # "num_articles": all_ids_institutions[:, 2].astype(int),
123
  }
124
  )
125
 
126
  # Add the individual similarities
127
  for i, concept in enumerate(chosen_concepts):
128
+ table_df[f"similarity_to_{chosen_concepts_names[i]}"] = all_similarities[concept]
129
 
130
  # Reorder the columns so that the mean similarity is after the individual similarities and before the institution name
131
  table_df = table_df[
132
+ ["institution"]
133
+ + [f"similarity_to_{chosen_concepts_names[i]}" for i in range(len(chosen_concepts))]
134
+ + ["mean_similarity", "institution_name"]
135
  ]
136
 
137
  # Sort by mean similarity
138
+ table_df = table_df.sort_values(by=["mean_similarity"], ascending=False)
139
 
140
  concept_names = [get_concept_name(concept_uri) for concept_uri in chosen_concepts]
141
  return (
 
151
  gr.Info("Performing PCA and clustering...")
152
  # Perform PCA
153
  embeddings_of_institutions = model.get_embeddings(
154
+ entities=np.array(table["institution"])
155
  )
156
 
157
  entity_embeddings_pca = pca(embeddings_of_institutions)
 
161
 
162
  plot_df = pd.DataFrame(
163
  {
164
+ "embedding_x": entity_embeddings_pca[:, 0],
165
+ "embedding_y": entity_embeddings_pca[:, 1],
166
+ "cluster": "cluster" + pd.Series(clusters).astype(str),
167
  }
168
  )
169
 
 
173
 
174
 
175
  def click_on_institution(table, embeddings_var, evt: gr.SelectData):
176
+ institution_id = table["institution"][evt.index[0]]
177
  try:
178
  embeddings_df = embeddings_var["embeddings_df"]
179
  plot_df = pd.DataFrame(
180
  {
181
+ "institution": table["institution"].values,
182
+ "institution_name": table["institution_name"].values,
183
+ "embedding_x": embeddings_df["embedding_x"].values,
184
+ "embedding_y": embeddings_df["embedding_y"].values,
185
+ "cluster": embeddings_df["cluster"].values,
186
  # "num_articles": table["num_articles"].values,
187
  }
188
  )
 
196
 
197
  plot_df = pd.DataFrame(
198
  {
199
+ "institution": table["institution"].values,
200
+ "Institution_name": table["institution Name"].values,
201
+ "embedding_x": embeddings_df["embedding_x"].values,
202
+ "embedding_y": embeddings_df["embedding_y"].values,
203
+ "cluster": embeddings_df["cluster"].values,
204
  # "num_articles": table["num_articles"].values,
205
  }
206
  )
 
215
  # fig.title("{} embeddings".format(parameter).capitalize())
216
  ax = sns.scatterplot(
217
  data=plot_df,
218
+ x="embedding_x",
219
+ y="embedding_y",
220
+ hue="cluster",
221
  )
222
 
223
+ row_of_institution = plot_df[plot_df["institution"] == institution_id]
224
  if not row_of_institution.empty:
225
  ax.text(
226
+ row_of_institution["embedding_x"],
227
+ row_of_institution["embedding_y"],
228
+ row_of_institution["institution_name"].values[0],
229
  horizontalalignment="left",
230
  size="medium",
231
  color="black",
 
233
  )
234
  # Also draw a point for the institution
235
  ax.scatter(
236
+ row_of_institution["embedding_x"],
237
+ row_of_institution["embedding_y"],
238
  color="black",
239
  s=100,
240
  marker="x",
241
  )
242
  # texts = []
243
  # for i, point in plot_df.iterrows():
244
+ # if point["institution"] == institution_id:
245
  # texts.append(
246
  # fig.text(
247
+ # point["embedding_x"] + 0.02,
248
+ # point["embedding_y"] + 0.01,
249
+ # str(point["institution_name"]),
250
  # )
251
  # )
252
  # adjust_text(texts)
 
257
  """
258
  Get the authors of an institution
259
  """
260
+ institution = institutions_table["institution"][0]
261
  number_of_row = evt.index[0]
262
+ institution = institutions_table["institution"][number_of_row]
263
  concepts = separate_concepts(concept_chooser)
264
  results_dfs = []
265
  for concept in concepts:
 
269
  WHERE {{
270
  ?author a <urn:acmcmc:unis:Author> .
271
  ?author <urn:acmcmc:unis:name> ?name .
272
+ ?article <urn:acmcmc:unis:written_in_institution> <{institution}> .
273
  ?article <urn:acmcmc:unis:has_author> ?author .
274
  ?article <urn:acmcmc:unis:related_to_concept> <{concept}> .
275
  }}
institutions.csv CHANGED
The diff for this file is too large to render. See raw diff
 
model/.data-00000-of-00001 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa8f3d8bd8f7a741cfe1ef560e5d2f894314342b51ec9a60844d5fc796b8e0c5
3
- size 2350332477
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1c911cf8812ae52e3a75dbb51ddf610067a96eb3b807a6f0bd7deb6dfc95ffc
3
+ size 1411474077
model/.index CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:364d14e1bb0830e861ef9c87ee188e8b00f90eea93ea07f828d69c3daa0a4139
3
  size 294
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49e9e5e144cbd54aa3a0a2a2e0a77395d682d9850b38ecd925622e386ea25f34
3
  size 294
model/model_metadata.ampkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95e4a9f0906a1e60acbe7771e223dae8fa88859afb65066cef0541c1cbc78378
3
- size 676909665
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcc9837fd93c7604c58c02ff89219154fa4129cccae86f4d5995feb32d4726a
3
+ size 406330271