Mark7549 committed
Commit 05fa263 · 1 Parent(s): ca2c3e1

used tSNE to reduce dimensions for 3d plot

Files changed (2)
  1. app.py +6 -10
  2. plots.py +11 -99
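
In short, the 3D nearest-neighbour plot now reduces the word vectors with scikit-learn's t-SNE instead of UMAP. Below is a minimal sketch of the reduction step this commit adopts, assuming 100-dimensional word2vec vectors as input; the random array is an illustrative stand-in, not data from the app:

import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Illustrative stand-in for the 100-dimensional vectors of a time-slice model.
all_vectors = np.random.rand(200, 100)

# Standardise, then project to three dimensions, as the new make_3d_plot_tSNE does.
vectors_scaled = StandardScaler().fit_transform(all_vectors)
tsne_result = TSNE(n_components=3, random_state=0).fit_transform(vectors_scaled)
print(tsne_result.shape)  # (200, 3)

Note that scikit-learn's TSNE only offers fit_transform (there is no transform for unseen points), so, as in the UMAP version, make_3d_plot_tSNE fits on the whole vocabulary of the selected time slice and only afterwards filters the result down to the requested nearest neighbours.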
app.py CHANGED
@@ -216,18 +216,14 @@ elif active_tab == "3D graph":
 
     if graph_button:
         time_slice_model = convert_time_name_to_model(time_slice)
-        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
-        # nearest_neighbours_3d_vectors = create_3d_vectors(word, time_slice_model, nearest_neighbours_vectors)
-        st.dataframe(nearest_neighbours_vectors)
-        # new_3d_vectors = nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors)
-        # st.dataframe(new_3d_vectors)
+        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
 
 
-        fig, df = make_3d_plot4(nearest_neighbours_vectors, word, time_slice_model)
+        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
 
-        st.dataframe(df)
+        # st.dataframe(df)
 
-        st.plotly_chart(fig)
+        st.plotly_chart(fig)
 
 
 
@@ -330,7 +326,7 @@ elif active_tab == "FAQ":
     ## FAQ
     """)
 
-    with st.expander("Which models is this interface based on?"):
+    with st.expander('''**Which models is this interface based on?**'''):
         st.write(
             "This interface is based on five language models. \
             Language models are statistical models of language, \
@@ -341,7 +337,7 @@ elif active_tab == "FAQ":
             The models on which this interface is based are Word Embedding models."
         )
 
-    with st.expander("Which corpus was used to train the models?"):
+    with st.expander('''**Which corpus was used to train the models?**'''):
         st.write(
             "The five models on which this interface is based were trained on five slices of the Diorisis Ancient Greek Corpus (Vatri & McGillivray 2018)."
        )
plots.py CHANGED
@@ -7,125 +7,37 @@ import pandas as pd
 from word2vec import *
 from sklearn.preprocessing import StandardScaler
 import plotly.express as px
+from sklearn.manifold import TSNE
 
 
-
-# def make_3d_plot(new_3d_vectors):
-#     """
-#     Turn DataFrame of 3D vectors into a 3D plot
-#     DataFrame structure: ['word', 'cosine_sim', '3d_vector']
-#     """
-#     fig = plt.figure()
-#     ax = fig.add_subplot(projection='3d')
-
-#     plt.ion()
-
-#     # Unpack vectors and labels from DataFrame
-#     labels = new_3d_vectors['word']
-#     x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
-#     y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
-#     z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])
-
-#     # Plot points
-#     ax.scatter(x, y, z)
-
-#     # Add labels
-#     for i, label in enumerate(labels):
-#         ax.text(x[i], y[i], z[i], label)
-
-#     # Set labels and title
-#     ax.set_xlabel('X')
-#     ax.set_ylabel('Y')
-#     ax.set_zlabel('Z')
-#     ax.set_title('3D plot of word vectors')
-
-#     return fig
-
-
-
-
-# def make_3d_plot2(df):
-#     """
-#     Turn DataFrame of 3D vectors into a 3D plot using plotly
-#     DataFrame structure: ['word', 'cosine_sim', '3d_vector']
-#     """
-#     vectors = df['3d_vector'].tolist()
-#     fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
-#     return fig
-
-
-# def make_3d_plot3(vectors_list, word, time_slice_model):
-#     """
-#     Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
-#     List structure: [(word, model_name, vector, cosine_sim)]
-#     """
-#     # Load model
-#     model = load_word2vec_model(f'models/{time_slice_model}.model')
-
-#     # Make UMAP model and fit it to the vectors
-#     umap_model = umap.UMAP(n_components=3)
-#     umap_model.fit(model.wv.vectors)
-
-#     # Transform the vectors to 3D
-#     transformed_vectors = umap_model.transform(model.wv.vectors)
-
-
-#     # Create DataFrame from the transformed vectors
-#     df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
-
-#     # Add word and cosine similarity to DataFrame
-#     df['word'] = model.wv.index_to_key
-
-#     # Filter the DataFrame for words in vectors_list and add cosine similarity
-#     word_list = [v[0] for v in vectors_list]
-#     cosine_sim_list = [v[3] for v in vectors_list]
-
-#     # Ensure that the word list and cosine similarity list are aligned properly
-#     df = df[df['word'].isin(word_list)]
-#     df['cosine_sim'] = cosine_sim_list
-
-#     # Create plot
-#     fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
-#     fig.update_traces(marker=dict(size=5))
-#     fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
-
-#     return fig, df
-
-
-
-def make_3d_plot4(vectors_list, word, time_slice_model):
+def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
     """
-    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
-    List structure: [(word, model_name, vector, cosine_sim)]
+    Turn list of 100D vectors into a 3D plot using t-SNE and Plotly.
+    List structure: [(word, model_name, vector, cosine_sim)]
     """
     # Load model
     model = load_word2vec_model(f'models/{time_slice_model}.model')
     model_dict = model_dictionary(model)
 
-
     # Extract vectors and names from model_dict
     all_vector_names = list(model_dict.keys())
     all_vectors = list(model_dict.values())
 
-
-    # Scale the vectors
+    # Scale vectors
     scaler = StandardScaler()
     vectors_scaled = scaler.fit_transform(all_vectors)
 
-    # Make UMAP model and fit it to the scaled vectors
-    umap_model = umap.UMAP(n_components=3)
-    umap_result = umap_model.fit_transform(vectors_scaled)
+    # Make t-SNE model and fit it to the scaled vectors
+    tsne_model = TSNE(n_components=3, random_state=0)
+    tsne_result = tsne_model.fit_transform(vectors_scaled)
 
-    # Now umap_result contains the 3D representations of the vectors
     # Associate the names with the 3D representations
-    result_with_names = [(all_vector_names[i], umap_result[i]) for i in range(len(all_vector_names))]
-
+    result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
 
     # Only keep the vectors that are in vectors_list and their cosine similarities
     result_with_names = [r for r in result_with_names if r[0] in [v[0] for v in vectors_list]]
     result_with_names = [(r[0], r[1], [v[3] for v in vectors_list if v[0] == r[0]][0]) for r in result_with_names]
 
-
     # Create DataFrame from the transformed vectors
     df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
 
@@ -136,10 +48,10 @@ def make_3d_plot4(vectors_list, word, time_slice_model):
     y = df['3d_vector'].apply(lambda v: v[1])
     z = df['3d_vector'].apply(lambda v: v[2])
 
-
-    # Create plot
+    # Plot
     fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
     fig.update_traces(marker=dict(size=5))
     fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
 
     return fig, df
+
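
For reference, a usage sketch of the new function; the neighbour tuples, the Greek words, and the 'archaic' time-slice name are illustrative assumptions, and a matching models/archaic.model file is assumed to exist:

from plots import make_3d_plot_tSNE

# Hypothetical neighbour tuples in the documented shape: (word, model_name, vector, cosine_sim)
neighbours = [
    ('μῦθος', 'archaic', [0.0] * 100, 0.91),  # placeholder vectors and similarities
    ('ἔπος', 'archaic', [0.0] * 100, 0.87),
]

fig, df = make_3d_plot_tSNE(neighbours, 'λόγος', 'archaic')
fig.show()  # app.py instead hands the figure to st.plotly_chart(fig)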