Used t-SNE to reduce dimensions for the 3D plot
Browse files
app.py
CHANGED
@@ -216,18 +216,14 @@ elif active_tab == "3D graph":
|
|
216 |
|
217 |
if graph_button:
|
218 |
time_slice_model = convert_time_name_to_model(time_slice)
|
219 |
-
nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
|
220 |
-
# nearest_neighbours_3d_vectors = create_3d_vectors(word, time_slice_model, nearest_neighbours_vectors)
|
221 |
-
st.dataframe(nearest_neighbours_vectors)
|
222 |
-
# new_3d_vectors = nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors)
|
223 |
-
# st.dataframe(new_3d_vectors)
|
224 |
|
|
|
225 |
|
226 |
-
|
227 |
|
228 |
-
st.
|
229 |
|
230 |
-
st.plotly_chart(fig)
|
231 |
|
232 |
|
233 |
|
@@ -330,7 +326,7 @@ elif active_tab == "FAQ":
|
|
330 |
## FAQ
|
331 |
""")
|
332 |
|
333 |
-
with st.expander(
|
334 |
st.write(
|
335 |
"This interface is based on five language models. \
|
336 |
Language models are statistical models of language, \
|
@@ -341,7 +337,7 @@ elif active_tab == "FAQ":
|
|
341 |
The models on which this interface is based are Word Embedding models."
|
342 |
)
|
343 |
|
344 |
-
with st.expander(
|
345 |
st.write(
|
346 |
"The five models on which this interface is based were trained on five slices of the Diorisis Ancient Greek Corpus (Vatri & McGillivray 2018)."
|
347 |
)
|
|
|
216 |
|
217 |
# "3D graph" tab: when the user submits the graph form, fetch the nearest
# neighbours of `word` in the chosen time slice and render them in 3D.
if graph_button:
    # Map the displayed time-slice name onto the model identifier used on disk.
    time_slice_model = convert_time_name_to_model(time_slice)
    # NOTE(review): presumably returns [(word, model_name, vector, cosine_sim)]
    # tuples for the n nearest neighbours — confirm against word2vec helpers.
    nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)

    # Reduce the neighbour vectors to 3D via t-SNE and show the interactive plot.
    fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
    # st.dataframe(df)
    st.plotly_chart(fig)
|
226 |
|
|
|
227 |
|
228 |
|
229 |
|
|
|
326 |
## FAQ
|
327 |
""")
|
328 |
|
329 |
+
with st.expander('''**Which models is this interface based on?**'''):
|
330 |
st.write(
|
331 |
"This interface is based on five language models. \
|
332 |
Language models are statistical models of language, \
|
|
|
337 |
The models on which this interface is based are Word Embedding models."
|
338 |
)
|
339 |
|
340 |
+
with st.expander('''**Which corpus was used to train the models?**'''):
|
341 |
st.write(
|
342 |
"The five models on which this interface is based were trained on five slices of the Diorisis Ancient Greek Corpus (Vatri & McGillivray 2018)."
|
343 |
)
|
plots.py
CHANGED
@@ -7,125 +7,37 @@ import pandas as pd
|
|
7 |
from word2vec import *
|
8 |
from sklearn.preprocessing import StandardScaler
|
9 |
import plotly.express as px
|
|
|
10 |
|
11 |
|
12 |
-
|
13 |
-
# def make_3d_plot(new_3d_vectors):
|
14 |
-
# """
|
15 |
-
# Turn DataFrame of 3D vectors into a 3D plot
|
16 |
-
# DataFrame structure: ['word', 'cosine_sim', '3d_vector']
|
17 |
-
# """
|
18 |
-
# fig = plt.figure()
|
19 |
-
# ax = fig.add_subplot(projection='3d')
|
20 |
-
|
21 |
-
# plt.ion()
|
22 |
-
|
23 |
-
# # Unpack vectors and labels from DataFrame
|
24 |
-
# labels = new_3d_vectors['word']
|
25 |
-
# x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
|
26 |
-
# y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
|
27 |
-
# z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])
|
28 |
-
|
29 |
-
# # Plot points
|
30 |
-
# ax.scatter(x, y, z)
|
31 |
-
|
32 |
-
# # Add labels
|
33 |
-
# for i, label in enumerate(labels):
|
34 |
-
# ax.text(x[i], y[i], z[i], label)
|
35 |
-
|
36 |
-
# # Set labels and title
|
37 |
-
# ax.set_xlabel('X')
|
38 |
-
# ax.set_ylabel('Y')
|
39 |
-
# ax.set_zlabel('Z')
|
40 |
-
# ax.set_title('3D plot of word vectors')
|
41 |
-
|
42 |
-
# return fig
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
# def make_3d_plot2(df):
|
48 |
-
# """
|
49 |
-
# Turn DataFrame of 3D vectors into a 3D plot using plotly
|
50 |
-
# DataFrame structure: ['word', 'cosine_sim', '3d_vector']
|
51 |
-
# """
|
52 |
-
# vectors = df['3d_vector'].tolist()
|
53 |
-
# fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
|
54 |
-
# return fig
|
55 |
-
|
56 |
-
|
57 |
-
# def make_3d_plot3(vectors_list, word, time_slice_model):
|
58 |
-
# """
|
59 |
-
# Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
|
60 |
-
# List structure: [(word, model_name, vector, cosine_sim)]
|
61 |
-
# """
|
62 |
-
# # Load model
|
63 |
-
# model = load_word2vec_model(f'models/{time_slice_model}.model')
|
64 |
-
|
65 |
-
# # Make UMAP model and fit it to the vectors
|
66 |
-
# umap_model = umap.UMAP(n_components=3)
|
67 |
-
# umap_model.fit(model.wv.vectors)
|
68 |
-
|
69 |
-
# # Transform the vectors to 3D
|
70 |
-
# transformed_vectors = umap_model.transform(model.wv.vectors)
|
71 |
-
|
72 |
-
|
73 |
-
# # Create DataFrame from the transformed vectors
|
74 |
-
# df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
|
75 |
-
|
76 |
-
# # Add word and cosine similarity to DataFrame
|
77 |
-
# df['word'] = model.wv.index_to_key
|
78 |
-
|
79 |
-
# # Filter the DataFrame for words in vectors_list and add cosine similarity
|
80 |
-
# word_list = [v[0] for v in vectors_list]
|
81 |
-
# cosine_sim_list = [v[3] for v in vectors_list]
|
82 |
-
|
83 |
-
# # Ensure that the word list and cosine similarity list are aligned properly
|
84 |
-
# df = df[df['word'].isin(word_list)]
|
85 |
-
# df['cosine_sim'] = cosine_sim_list
|
86 |
-
|
87 |
-
# # Create plot
|
88 |
-
# fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
|
89 |
-
# fig.update_traces(marker=dict(size=5))
|
90 |
-
# fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
|
91 |
-
|
92 |
-
# return fig, df
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
def make_3d_plot4(vectors_list, word, time_slice_model):
|
97 |
"""
|
98 |
-
|
99 |
-
|
100 |
"""
|
101 |
# Load model
|
102 |
model = load_word2vec_model(f'models/{time_slice_model}.model')
|
103 |
model_dict = model_dictionary(model)
|
104 |
|
105 |
-
|
106 |
# Extract vectors and names from model_dict
|
107 |
all_vector_names = list(model_dict.keys())
|
108 |
all_vectors = list(model_dict.values())
|
109 |
|
110 |
-
|
111 |
-
# Scale the vectors
|
112 |
scaler = StandardScaler()
|
113 |
vectors_scaled = scaler.fit_transform(all_vectors)
|
114 |
|
115 |
-
# Make
|
116 |
-
|
117 |
-
|
118 |
|
119 |
-
# Now umap_result contains the 3D representations of the vectors
|
120 |
# Associate the names with the 3D representations
|
121 |
-
result_with_names = [(all_vector_names[i],
|
122 |
-
|
123 |
|
124 |
# Only keep the vectors that are in vectors_list and their cosine similarities
|
125 |
result_with_names = [r for r in result_with_names if r[0] in [v[0] for v in vectors_list]]
|
126 |
result_with_names = [(r[0], r[1], [v[3] for v in vectors_list if v[0] == r[0]][0]) for r in result_with_names]
|
127 |
|
128 |
-
|
129 |
# Create DataFrame from the transformed vectors
|
130 |
df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
|
131 |
|
@@ -136,10 +48,10 @@ def make_3d_plot4(vectors_list, word, time_slice_model):
|
|
136 |
y = df['3d_vector'].apply(lambda v: v[1])
|
137 |
z = df['3d_vector'].apply(lambda v: v[2])
|
138 |
|
139 |
-
|
140 |
-
# Create plot
|
141 |
fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
|
142 |
fig.update_traces(marker=dict(size=5))
|
143 |
fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
|
144 |
|
145 |
return fig, df
|
|
|
|
7 |
from word2vec import *
|
8 |
from sklearn.preprocessing import StandardScaler
|
9 |
import plotly.express as px
|
10 |
+
from sklearn.manifold import TSNE
|
11 |
|
12 |
|
13 |
+
def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
    """
    Turn a list of high-dimensional word vectors into a 3D Plotly scatter
    plot using t-SNE dimensionality reduction.

    Parameters
    ----------
    vectors_list : list[tuple]
        Nearest-neighbour tuples shaped (word, model_name, vector, cosine_sim);
        only fields [0] (word) and [3] (cosine_sim) are read here.
    word : str
        The query word; used only in the plot title.
    time_slice_model : str
        Model file stem — the model is loaded from f'models/{time_slice_model}.model'.

    Returns
    -------
    (fig, df)
        The Plotly figure and the backing DataFrame with columns
        ['word', '3d_vector', 'cosine_sim'].
    """
    # Load the word2vec model and its {word: vector} dictionary.
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    model_dict = model_dictionary(model)

    # Extract vectors and their words from the model dictionary.
    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Standardize the vectors before t-SNE (distance-based methods are
    # sensitive to per-dimension scale).
    scaler = StandardScaler()
    vectors_scaled = scaler.fit_transform(all_vectors)

    # Fit t-SNE on the whole vocabulary so the 3D layout reflects the full
    # neighbourhood structure, not just the selected neighbours.
    # random_state is pinned so Streamlit reruns produce the same layout.
    tsne_model = TSNE(n_components=3, random_state=0)
    tsne_result = tsne_model.fit_transform(vectors_scaled)

    # Map each neighbour word to its cosine similarity once, instead of the
    # original O(vocab * neighbours) nested list scans. setdefault keeps the
    # FIRST occurrence on duplicates, matching the original [... ][0] pick.
    cosine_by_word = {}
    for v in vectors_list:
        cosine_by_word.setdefault(v[0], v[3])

    # Keep only the nearest-neighbour words, paired with their 3D coordinates
    # and cosine similarities.
    result_with_names = [
        (name, tsne_result[i], cosine_by_word[name])
        for i, name in enumerate(all_vector_names)
        if name in cosine_by_word
    ]

    # Create DataFrame from the transformed vectors.
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Unpack the 3D coordinates for plotting.
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Plot, colouring points by cosine similarity to the query word.
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df
|
57 |
+
|