Spaces:
Configuration error
Configuration error
import os | |
import operator | |
import numpy as np | |
import pandas as pd | |
from numpy import dot | |
from gensim import matutils | |
from modules.module_ann import Ann | |
from memory_profiler import profile | |
from sklearn.neighbors import NearestNeighbors | |
from data.data_loader import load_embeddings | |
class Embedding:
    """Word-embedding table with lookup, similarity and neighbour queries.

    The table is loaded once at construction time through
    ``load_embeddings`` and is expected to expose the columns ``'word'``,
    ``'embedding'`` and ``'pca'`` (those are the only keys this class reads).
    Two neighbour-search back ends are prepared on load: the project-local
    ``Ann`` helper and scikit-learn's ``NearestNeighbors``.
    """

    def __init__(self, path, binary, limit=None, randomizedPCA=False):
        """Load the embeddings stored at *path* and build the search indexes.

        Args:
            path: Location of the embeddings file; forwarded to
                ``load_embeddings``.
            binary: Forwarded unchanged to ``load_embeddings``
                (presumably selects a binary file format -- TODO confirm).
            limit: Forwarded unchanged to ``load_embeddings``
                (presumably caps the number of rows read -- TODO confirm).
            randomizedPCA: Forwarded unchanged to ``load_embeddings``.
        """
        # Dataset info
        self.path = path

        # Pandas dataset
        self.ds = None

        # All word embeddings, List[List[float]]
        self.embedding = None

        # Approximate nearest-neighbours estimator (project `Ann` helper)
        self.ann = None

        # Scikit-learn NearestNeighbors estimator, fitted in __load
        self.neigh = None

        # Load embedding and pca dataset
        self.__load(binary, limit, randomizedPCA)

    def __contains__(self, word):
        """Return True when *word* is present in the ``'word'`` column."""
        try:
            return word in self._wordIndex()
        except TypeError:
            # Unhashable keys (e.g. a list) can never match a stored word.
            return False

    def _wordIndex(self):
        """Return a ``{word: row position}`` dict for the current dataset.

        The mapping is built on first use and cached, so membership tests and
        lookups no longer copy and scan the whole ``'word'`` column on every
        call. The cache is keyed on the identity of ``self.ds`` and is rebuilt
        if that attribute is reassigned (in-place edits of the dataset are not
        detected).
        """
        cache = getattr(self, '_index_cache', None)
        if cache is None or cache[0] is not self.ds:
            index = {}
            for position, token in enumerate(self.ds['word'].to_list()):
                # setdefault keeps the first occurrence of a duplicated word,
                # matching what list.index() returned before.
                index.setdefault(token, position)
            cache = (self.ds, index)
            self._index_cache = cache
        return cache[1]

    def __load(self, binary, limit, randomizedPCA):
        """Read the dataset and fit both nearest-neighbour back ends."""
        print(f"Preparing {os.path.basename(self.path)} embeddings...")

        # --- Prepare dataset ---
        self.ds = load_embeddings(self.path, binary, randomizedPCA, limit)

        # --- Get embedding from string
        self.embedding = self.ds['embedding'].to_list()

        # --- Get forest tree to estimate Nearest Neighbors ---
        self.ann = Ann(
            words=self.ds['word'],
            vectors=self.ds['embedding'],
            coord=self.ds['pca']
        )
        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)

        # --- Fit Sklearn NN method ---
        self.neigh = NearestNeighbors(n_neighbors=20)
        self.neigh.fit(self.embedding)

    def __getValue(self, word, feature):
        """Return the *feature* cell of the row holding *word*, or None.

        Args:
            word: Vocabulary entry to look up.
            feature: Name of the dataset column to read.
        """
        if word not in self:
            return None
        word_id = self._wordIndex()[word]
        # Positional access: the index stores row positions, not labels.
        return self.ds[feature].iloc[word_id]

    def getEmbedding(self, word):
        """Return the ``'embedding'`` entry for *word*, or None if unknown."""
        return self.__getValue(word, 'embedding')

    def getPCA(self, word):
        """Return the ``'pca'`` entry for *word*, or None if unknown."""
        return self.__getValue(word, 'pca')

    def cosineSimilarities(self, vector_1, vectors_all):
        """Return the cosine similarity of *vector_1* with each row of *vectors_all*.

        Args:
            vector_1: 1-D vector.
            vectors_all: 2-D collection with one vector per row (norms are
                taken along ``axis=1``).

        Returns:
            One similarity per row of *vectors_all*. A zero-norm vector
            results in a division by zero, which is not guarded here.
        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
        """Return the words closest to *word*.

        Args:
            word: Query word.
            n_neighbors: Number of neighbours requested.
            nn_method: ``'ann'`` delegates to ``self.ann.get``; ``'sklearn'``
                queries the fitted ``NearestNeighbors`` estimator.

        Returns:
            For ``'ann'``, whatever ``self.ann.get`` returns. For
            ``'sklearn'``, a tuple of words (a tuple even when a single
            neighbour is requested), or ``[]`` when *word* is not in the
            vocabulary. ``[]`` for any other *nn_method*.
        """
        if nn_method == 'ann':
            return self.ann.get(word, n_neighbors)

        if nn_method == 'sklearn':
            word_emb = self.getEmbedding(word)
            if word_emb is None:
                # An unknown word has no vector to query with; passing None
                # on to kneighbors would only fail with an obscure error.
                return []
            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
            # kneighbors yields row positions of the fitted list, so select
            # positionally; tuple() keeps the result a sequence even for a
            # single neighbour (itemgetter would return a bare string there).
            return tuple(self.ds['word'].iloc[neighbors])

        return []

    def getCosineSimilarities(self, w1, w2):
        """Return the cosine similarity between the embeddings of two words.

        Both vectors are normalised with ``matutils.unitvec`` before the dot
        product. No vocabulary check is done here: both words are expected to
        be known, since ``getEmbedding`` yields None for unknown words.
        """
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2))
        )