Spaces:
Configuration error
Configuration error
File size: 2,923 Bytes
785b2ef a779273 785b2ef a779273 785b2ef a779273 785b2ef a779273 785b2ef a779273 785b2ef a779273 785b2ef a779273 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import os
import operator
import numpy as np
import pandas as pd
from numpy import dot
from gensim import matutils
from modules.module_ann import Ann
from memory_profiler import profile
from sklearn.neighbors import NearestNeighbors
from data.data_loader import load_embeddings
class Embedding:
    """Word-embedding table with nearest-neighbour and similarity queries.

    The table is obtained from ``load_embeddings`` and is expected to expose
    the columns ``'word'``, ``'embedding'`` and ``'pca'`` (these are the only
    columns this class reads). Two neighbour-search back-ends are prepared at
    construction time: the project's ``Ann`` helper and scikit-learn's
    ``NearestNeighbors``.

    Lookup caches (word -> row position, column -> list of values) are built
    once when the data is loaded; they are not refreshed if ``self.ds`` is
    replaced or mutated afterwards.
    """

    @profile
    def __init__(self, path, binary, limit=None, randomizedPCA=False):
        """Load the embeddings found at *path* and build the search indexes.

        Args:
            path: Location of the embedding file; forwarded to
                ``load_embeddings`` and used (basename only) in the progress
                message.
            binary: Forwarded unchanged to ``load_embeddings``.
            limit: Forwarded unchanged to ``load_embeddings``.
            randomizedPCA: Forwarded unchanged to ``load_embeddings``.
        """
        # Dataset info
        self.path = path

        # Pandas dataset
        self.ds = None

        # All word embeddings, one entry per dataset row
        self.embedding = None

        # Approximate nearest-neighbour estimator
        self.ann = None

        # scikit-learn nearest-neighbour estimator
        self.neigh = None

        # word -> position of its first row in the dataset
        self._word_index = {}

        # column name -> list of that column's values (filled on demand)
        self._columns = {}

        # Load embedding and pca dataset
        self.__load(binary, limit, randomizedPCA)

    def __contains__(self, word):
        """Return True when *word* is part of the loaded vocabulary."""
        try:
            return word in self._word_index
        except TypeError:
            # Unhashable objects cannot be dictionary keys, hence are not
            # in the index; report "absent" instead of raising.
            return False

    def __load(self, binary, limit, randomizedPCA):
        """Read the dataset and prepare every lookup / search structure."""
        print(f"Preparing {os.path.basename(self.path)} embeddings...")

        # --- Prepare dataset ---
        self.ds = load_embeddings(self.path, binary, randomizedPCA, limit)

        # --- Get embedding list ---
        self.embedding = self.ds['embedding'].to_list()

        # --- Cache columns and build the word -> position index ---
        vocabulary = self.ds['word'].to_list()
        self._columns = {'word': vocabulary, 'embedding': self.embedding}
        self._word_index = {}
        for position, word in enumerate(vocabulary):
            # setdefault keeps the first occurrence, like list.index() does
            self._word_index.setdefault(word, position)

        # --- Get forest tree to estimate Nearest Neighbors ---
        self.ann = Ann(
            words=self.ds['word'],
            vectors=self.ds['embedding'],
            coord=self.ds['pca']
        )
        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)

        # --- Fit Sklearn NN method ---
        self.neigh = NearestNeighbors(n_neighbors=20)
        self.neigh.fit(self.embedding)

    def __getValue(self, word, feature):
        """Return the *feature* column value for *word*, or None if unknown.

        Raises:
            KeyError: if *word* is known but *feature* is not a dataset column.
        """
        if word not in self:
            return None

        values = self._columns.get(feature)
        if values is None:
            # Convert each column only once; later lookups reuse the list.
            values = self._columns[feature] = self.ds[feature].to_list()

        return values[self._word_index[word]]

    def getEmbedding(self, word):
        """Return the embedding stored for *word*, or None if unknown."""
        return self.__getValue(word, 'embedding')

    def getPCA(self, word):
        """Return the ``'pca'`` column value for *word*, or None if unknown."""
        return self.__getValue(word, 'pca')

    def cosineSimilarities(self, vector_1, vectors_all):
        """Return the cosine similarity of *vector_1* to each row of *vectors_all*.

        Args:
            vector_1: A single vector.
            vectors_all: A 2-D collection of vectors (norms are taken along
                axis 1), each with the same length as *vector_1*.

        Returns:
            One similarity per row of *vectors_all*. A zero-norm vector on
            either side makes the corresponding division a division by zero.
        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
        """Return the words closest to *word*.

        Args:
            word: Query word.
            n_neighbors: Number of neighbours requested.
            nn_method: ``'ann'`` delegates to ``self.ann.get``; ``'sklearn'``
                queries the fitted ``NearestNeighbors`` estimator.

        Returns:
            For ``'ann'``, whatever ``self.ann.get`` returns. For
            ``'sklearn'``, a tuple of words (always a tuple, even for a single
            neighbour), or an empty list when *word* is not in the
            vocabulary. An empty list for any other *nn_method*.
        """
        if nn_method == 'ann':
            return self.ann.get(word, n_neighbors)

        if nn_method == 'sklearn':
            word_emb = self.getEmbedding(word)
            if word_emb is None:
                # Unknown word: there is no vector to query with.
                return []

            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]

            # kneighbors yields row positions of the fitted data, so resolve
            # them positionally rather than through the Series' labels.
            vocabulary = self._columns['word']
            return tuple(vocabulary[position] for position in neighbors)

        return []

    def getCosineSimilarities(self, w1, w2):
        """Return the cosine similarity between the embeddings of *w1* and *w2*.

        Both embeddings are scaled to unit length before taking their dot
        product.

        NOTE(review): a word missing from the vocabulary makes
        ``getEmbedding`` return None, which is passed straight to
        ``matutils.unitvec`` -- callers should check membership first.
        """
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2))
        )