import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors


def load_embeddings(path, binary=False, random_pca=False, limit=None):
    """Load word2vec-format embeddings, project them to 2-D with PCA,
    and write a lowercased, deduplicated DataFrame to JSON."""
    if random_pca:
        # Randomized SVD solver: faster on large vocabularies.
        pca = PCA(n_components=2, copy=False, whiten=False,
                  svd_solver='randomized', iterated_power='auto')
    else:
        pca = PCA(n_components=2)

    model = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

    # Cased vocabulary (gensim 3.x API; see the note at the end of the
    # script for the gensim >= 4 equivalents).
    cased_words = list(model.vocab.keys())

    # L2-normalize the vectors in place, then collect one per word.
    model.init_sims(replace=True)
    cased_emb = [model[word] for word in cased_words]

    # PCA reduction to two dimensions.
    cased_pca = pca.fit_transform(cased_emb)

    df_cased = pd.DataFrame(
        zip(cased_words, cased_emb, cased_pca),
        columns=['word', 'embedding', 'pca'],
    )

    # Lowercase the words and keep the first occurrence of each form.
    df_cased['word'] = df_cased.word.str.lower()
    df_uncased = df_cased.drop_duplicates(subset='word')

    # e.g. './wiki-news-300d-1M.vec' -> './wiki-news-300d-1M.json'
    df_uncased.to_json(path[:-3] + 'json')


load_embeddings('./wiki-news-300d-1M.vec', limit=10000)
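
# Note (assumption): the script above targets the gensim 3.x API. Under
# gensim >= 4.0, `model.vocab` and `init_sims()` were removed; a sketch
# of the equivalent vocabulary/normalization calls would be:
#
#     cased_words = list(model.key_to_index)
#     cased_emb = [model.get_vector(word, norm=True) for word in cased_words]
#
# A minimal sketch of reading the exported file back for inspection.
# `read_embeddings` is a hypothetical helper, not part of the original;
# it assumes the './wiki-news-300d-1M.json' file written by the call above.
def read_embeddings(json_path):
    # Round-trips the DataFrame written by load_embeddings; the
    # 'embedding' and 'pca' columns come back as plain Python lists.
    return pd.read_json(json_path)

# Example usage:
# df = read_embeddings('./wiki-news-300d-1M.json')
# print(df.head())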