edia_we_es / modules /model_embbeding.py
LMartinezEXEX's picture
Using data_loader to load vector files instead of json.
785b2ef
raw
history blame
2.92 kB
import os
import operator
import numpy as np
import pandas as pd
from numpy import dot
from gensim import matutils
from modules.module_ann import Ann
from memory_profiler import profile
from sklearn.neighbors import NearestNeighbors
from data.data_loader import load_embeddings
class Embedding:
    """In-memory word-embedding store loaded from a vector file.

    Wraps a pandas dataset with 'word', 'embedding' and 'pca' columns and
    exposes:
      * O(1) membership (`word in emb`) and per-word lookups,
      * nearest-neighbor queries via an approximate forest (`Ann`) or
        exact sklearn `NearestNeighbors`,
      * cosine similarities between vectors / words.
    """

    @profile
    def __init__(self, path, binary, limit=None, randomizedPCA=False):
        """
        Args:
            path: Path of the vector file on disk.
            binary: Whether the vector file is in binary format.
            limit: Optional cap on the number of vectors to load (None = all).
            randomizedPCA: Use randomized PCA for the 2D projection.
        """
        # Dataset info
        self.path = path
        # Pandas dataset ('word', 'embedding', 'pca' columns)
        self.ds = None
        # All word embeddings: List[List[float]]
        self.embedding = None
        # Approximate Nearest Neighbors estimator
        self.ann = None
        # word -> row index, built once at load time so membership tests and
        # lookups are O(1) instead of rebuilding the word list per call
        # (the original did `.to_list()` + `.index()` on every query).
        self.__word2idx = {}
        # Load embedding and pca dataset
        self.__load(binary, limit, randomizedPCA)

    def __contains__(self, word):
        # O(1) dict membership instead of materializing the word column.
        return word in self.__word2idx

    def __load(self, binary, limit, randomizedPCA):
        """Load the dataset and build both nearest-neighbor indexes."""
        print(f"Preparing {os.path.basename(self.path)} embeddings...")

        # --- Prepare dataset ---
        self.ds = load_embeddings(self.path, binary, randomizedPCA, limit)

        # --- Index words once for constant-time lookups ---
        self.__word2idx = {
            w: i for i, w in enumerate(self.ds['word'].to_list())
        }

        # --- Get embedding from string ---
        self.embedding = self.ds['embedding'].to_list()

        # --- Get forest tree to estimate Nearest Neighbors ---
        self.ann = Ann(
            words=self.ds['word'],
            vectors=self.ds['embedding'],
            coord=self.ds['pca'],
        )
        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)

        # --- Fit Sklearn NN method ---
        self.neigh = NearestNeighbors(n_neighbors=20)
        self.neigh.fit(self.embedding)

    def __getValue(self, word, feature):
        """Return `feature` ('embedding' or 'pca') for `word`, or None if absent."""
        word_id = self.__word2idx.get(word)
        if word_id is not None:
            # .iloc fetches the single cell directly instead of converting
            # the whole column to a list first.
            return self.ds[feature].iloc[word_id]
        return None

    def getEmbedding(self, word):
        """Embedding vector for `word`, or None if the word is unknown."""
        return self.__getValue(word, 'embedding')

    def getPCA(self, word):
        """2D PCA coordinates for `word`, or None if the word is unknown."""
        return self.__getValue(word, 'pca')

    def cosineSimilarities(self, vector_1, vectors_all):
        """Cosine similarity between `vector_1` and each row of `vectors_all`.

        Returns a 1-D array with one similarity per row.
        NOTE(review): a zero-norm vector would divide by zero here — assumed
        not to occur in loaded embeddings; confirm with the data loader.
        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
        """Return the `n_neighbors` words closest to `word`.

        Args:
            word: Query word (must be present for the 'sklearn' method).
            n_neighbors: Number of neighbors to return.
            nn_method: 'ann' (approximate forest) or 'sklearn' (exact);
                any other value yields an empty list.
        """
        if nn_method == 'ann':
            words = self.ann.get(word, n_neighbors)
        elif nn_method == 'sklearn':
            word_emb = self.getEmbedding(word)
            # kneighbors returns (distances, indices); take the index row.
            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
            words = operator.itemgetter(*neighbors)(self.ds['word'])
        else:
            words = []
        return words

    def getCosineSimilarities(self, w1, w2):
        """Cosine similarity between the embeddings of words `w1` and `w2`."""
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2)),
        )