Mark7549 committed
Commit 05fa263 · 1 Parent(s): ca2c3e1

used tSNE to reduce dimensions for 3d plot

Files changed (2)
  1. app.py +6 -10
  2. plots.py +11 -99
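
In short, the 3D nearest-neighbour plot now reduces the word vectors with scikit-learn's t-SNE instead of UMAP. Below is a minimal sketch of the reduction step this commit adopts, assuming 100-dimensional word2vec vectors as input; the random array is an illustrative stand-in, not data from the app:

import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Illustrative stand-in for the 100-dimensional vectors of a time-slice model.
all_vectors = np.random.rand(200, 100)

# Standardise, then project to three dimensions, as the new make_3d_plot_tSNE does.
vectors_scaled = StandardScaler().fit_transform(all_vectors)
tsne_result = TSNE(n_components=3, random_state=0).fit_transform(vectors_scaled)
print(tsne_result.shape)  # (200, 3)

Note that scikit-learn's TSNE only offers fit_transform (there is no transform for unseen points), so, as in the UMAP version, make_3d_plot_tSNE fits on the whole vocabulary of the selected time slice and only afterwards filters the result down to the requested nearest neighbours.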
app.py CHANGED
@@ -216,18 +216,14 @@ elif active_tab == "3D graph":
 
     if graph_button:
         time_slice_model = convert_time_name_to_model(time_slice)
-        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
-        # nearest_neighbours_3d_vectors = create_3d_vectors(word, time_slice_model, nearest_neighbours_vectors)
-        st.dataframe(nearest_neighbours_vectors)
-        # new_3d_vectors = nearest_neighbours_to_pca_vectors(word, time_slice, nearest_neighbours_vectors)
-        # st.dataframe(new_3d_vectors)
+        nearest_neighbours_vectors = get_nearest_neighbours_vectors(word, time_slice_model, n)
 
 
-        fig, df = make_3d_plot4(nearest_neighbours_vectors, word, time_slice_model)
+        fig, df = make_3d_plot_tSNE(nearest_neighbours_vectors, word, time_slice_model)
 
-        st.dataframe(df)
+        # st.dataframe(df)
 
-        st.plotly_chart(fig)
+        st.plotly_chart(fig)
 
 
 
@@ -330,7 +326,7 @@ elif active_tab == "FAQ":
     ## FAQ
     """)
 
-    with st.expander("Which models is this interface based on?"):
+    with st.expander('''**Which models is this interface based on?**'''):
         st.write(
             "This interface is based on five language models. \
             Language models are statistical models of language, \
@@ -341,7 +337,7 @@ elif active_tab == "FAQ":
             The models on which this interface is based are Word Embedding models."
         )
 
-    with st.expander("Which corpus was used to train the models?"):
+    with st.expander('''**Which corpus was used to train the models?**'''):
         st.write(
             "The five models on which this interface is based were trained on five slices of the Diorisis Ancient Greek Corpus (Vatri & McGillivray 2018)."
        )
plots.py CHANGED
@@ -7,125 +7,37 @@ import pandas as pd
 from word2vec import *
 from sklearn.preprocessing import StandardScaler
 import plotly.express as px
+from sklearn.manifold import TSNE
 
 
-
-# def make_3d_plot(new_3d_vectors):
-#     """
-#     Turn DataFrame of 3D vectors into a 3D plot
-#     DataFrame structure: ['word', 'cosine_sim', '3d_vector']
-#     """
-#     fig = plt.figure()
-#     ax = fig.add_subplot(projection='3d')
-
-#     plt.ion()
-
-#     # Unpack vectors and labels from DataFrame
-#     labels = new_3d_vectors['word']
-#     x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
-#     y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
-#     z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])
-
-#     # Plot points
-#     ax.scatter(x, y, z)
-
-#     # Add labels
-#     for i, label in enumerate(labels):
-#         ax.text(x[i], y[i], z[i], label)
-
-#     # Set labels and title
-#     ax.set_xlabel('X')
-#     ax.set_ylabel('Y')
-#     ax.set_zlabel('Z')
-#     ax.set_title('3D plot of word vectors')
-
-#     return fig
-
-
-
-
-# def make_3d_plot2(df):
-#     """
-#     Turn DataFrame of 3D vectors into a 3D plot using plotly
-#     DataFrame structure: ['word', 'cosine_sim', '3d_vector']
-#     """
-#     vectors = df['3d_vector'].tolist()
-#     fig = px.scatter_3d(df, x=[v[0] for v in vectors], y=[v[1] for v in vectors], z=[v[2] for v in vectors], text=df['word'])
-#     return fig
-
-
-# def make_3d_plot3(vectors_list, word, time_slice_model):
-#     """
-#     Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
-#     List structure: [(word, model_name, vector, cosine_sim)]
-#     """
-#     # Load model
-#     model = load_word2vec_model(f'models/{time_slice_model}.model')
-
-#     # Make UMAP model and fit it to the vectors
-#     umap_model = umap.UMAP(n_components=3)
-#     umap_model.fit(model.wv.vectors)
-
-#     # Transform the vectors to 3D
-#     transformed_vectors = umap_model.transform(model.wv.vectors)
-
-
-#     # Create DataFrame from the transformed vectors
-#     df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
-
-#     # Add word and cosine similarity to DataFrame
-#     df['word'] = model.wv.index_to_key
-
-#     # Filter the DataFrame for words in vectors_list and add cosine similarity
-#     word_list = [v[0] for v in vectors_list]
-#     cosine_sim_list = [v[3] for v in vectors_list]
-
-#     # Ensure that the word list and cosine similarity list are aligned properly
-#     df = df[df['word'].isin(word_list)]
-#     df['cosine_sim'] = cosine_sim_list
-
-#     # Create plot
-#     fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
-#     fig.update_traces(marker=dict(size=5))
-#     fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
-
-#     return fig, df
-
-
-
-def make_3d_plot4(vectors_list, word, time_slice_model):
+def make_3d_plot_tSNE(vectors_list, word, time_slice_model):
     """
-    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.
-    List structure: [(word, model_name, vector, cosine_sim)]
+    Turn list of 100D vectors into a 3D plot using t-SNE and Plotly.
+    List structure: [(word, model_name, vector, cosine_sim)]
     """
     # Load model
     model = load_word2vec_model(f'models/{time_slice_model}.model')
     model_dict = model_dictionary(model)
 
-
     # Extract vectors and names from model_dict
     all_vector_names = list(model_dict.keys())
     all_vectors = list(model_dict.values())
 
-
-    # Scale the vectors
+    # Scale vectors
     scaler = StandardScaler()
     vectors_scaled = scaler.fit_transform(all_vectors)
 
-    # Make UMAP model and fit it to the scaled vectors
-    umap_model = umap.UMAP(n_components=3)
-    umap_result = umap_model.fit_transform(vectors_scaled)
+    # Make t-SNE model and fit it to the scaled vectors
+    tsne_model = TSNE(n_components=3, random_state=0)
+    tsne_result = tsne_model.fit_transform(vectors_scaled)
 
-    # Now umap_result contains the 3D representations of the vectors
     # Associate the names with the 3D representations
-    result_with_names = [(all_vector_names[i], umap_result[i]) for i in range(len(all_vector_names))]
-
+    result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
 
     # Only keep the vectors that are in vectors_list and their cosine similarities
     result_with_names = [r for r in result_with_names if r[0] in [v[0] for v in vectors_list]]
     result_with_names = [(r[0], r[1], [v[3] for v in vectors_list if v[0] == r[0]][0]) for r in result_with_names]
 
-
     # Create DataFrame from the transformed vectors
     df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
 
@@ -136,10 +48,10 @@ def make_3d_plot4(vectors_list, word, time_slice_model):
     y = df['3d_vector'].apply(lambda v: v[1])
     z = df['3d_vector'].apply(lambda v: v[2])
 
-
-    # Create plot
+    # Plot
     fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
     fig.update_traces(marker=dict(size=5))
     fig.update_layout(title=f'3D plot of nearest neighbours to {word}')
 
     return fig, df
+
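
For reference, a usage sketch of the new function; the neighbour tuples, the Greek words, and the 'archaic' time-slice name are illustrative assumptions, and a matching models/archaic.model file is assumed to exist:

from plots import make_3d_plot_tSNE

# Hypothetical neighbour tuples in the documented shape: (word, model_name, vector, cosine_sim)
neighbours = [
    ('μῦθος', 'archaic', [0.0] * 100, 0.91),  # placeholder vectors and similarities
    ('ἔπος', 'archaic', [0.0] * 100, 0.87),
]

fig, df = make_3d_plot_tSNE(neighbours, 'λόγος', 'archaic')
fig.show()  # app.py instead hands the figure to st.plotly_chart(fig)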