edia_we_es / modules /model_embbeding.py
LMartinezEXEX's picture
Using data_loader to load vector files instead of json.
785b2ef
raw
history blame
2.92 kB
import os
import operator
import numpy as np
import pandas as pd
from numpy import dot
from gensim import matutils
from modules.module_ann import Ann
from memory_profiler import profile
from sklearn.neighbors import NearestNeighbors
from data.data_loader import load_embeddings
class Embedding:
    """In-memory word-embedding store loaded from a vector file.

    Wraps a pandas dataset with 'word', 'embedding' and 'pca' columns and
    exposes:
      * O(1) membership (`word in emb`) and per-word lookups,
      * nearest-neighbor queries via an approximate forest (`Ann`) or
        exact sklearn `NearestNeighbors`,
      * cosine similarities between vectors / words.
    """

    @profile
    def __init__(self, path, binary, limit=None, randomizedPCA=False):
        """
        Args:
            path: Path of the vector file on disk.
            binary: Whether the vector file is in binary format.
            limit: Optional cap on the number of vectors to load (None = all).
            randomizedPCA: Use randomized PCA for the 2D projection.
        """
        # Dataset info
        self.path = path
        # Pandas dataset ('word', 'embedding', 'pca' columns)
        self.ds = None
        # All word embeddings: List[List[float]]
        self.embedding = None
        # Approximate Nearest Neighbors estimator
        self.ann = None
        # word -> row index, built once at load time so membership tests and
        # lookups are O(1) instead of rebuilding the word list per call
        # (the original did `.to_list()` + `.index()` on every query).
        self.__word2idx = {}
        # Load embedding and pca dataset
        self.__load(binary, limit, randomizedPCA)

    def __contains__(self, word):
        # O(1) dict membership instead of materializing the word column.
        return word in self.__word2idx

    def __load(self, binary, limit, randomizedPCA):
        """Load the dataset and build both nearest-neighbor indexes."""
        print(f"Preparing {os.path.basename(self.path)} embeddings...")

        # --- Prepare dataset ---
        self.ds = load_embeddings(self.path, binary, randomizedPCA, limit)

        # --- Index words once for constant-time lookups ---
        self.__word2idx = {
            w: i for i, w in enumerate(self.ds['word'].to_list())
        }

        # --- Get embedding from string ---
        self.embedding = self.ds['embedding'].to_list()

        # --- Get forest tree to estimate Nearest Neighbors ---
        self.ann = Ann(
            words=self.ds['word'],
            vectors=self.ds['embedding'],
            coord=self.ds['pca'],
        )
        self.ann.init(n_trees=20, metric='dot', n_jobs=-1)

        # --- Fit Sklearn NN method ---
        self.neigh = NearestNeighbors(n_neighbors=20)
        self.neigh.fit(self.embedding)

    def __getValue(self, word, feature):
        """Return `feature` ('embedding' or 'pca') for `word`, or None if absent."""
        word_id = self.__word2idx.get(word)
        if word_id is not None:
            # .iloc fetches the single cell directly instead of converting
            # the whole column to a list first.
            return self.ds[feature].iloc[word_id]
        return None

    def getEmbedding(self, word):
        """Embedding vector for `word`, or None if the word is unknown."""
        return self.__getValue(word, 'embedding')

    def getPCA(self, word):
        """2D PCA coordinates for `word`, or None if the word is unknown."""
        return self.__getValue(word, 'pca')

    def cosineSimilarities(self, vector_1, vectors_all):
        """Cosine similarity between `vector_1` and each row of `vectors_all`.

        Returns a 1-D array with one similarity per row.
        NOTE(review): a zero-norm vector would divide by zero here — assumed
        not to occur in loaded embeddings; confirm with the data loader.
        """
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities

    def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
        """Return the `n_neighbors` words closest to `word`.

        Args:
            word: Query word (must be present for the 'sklearn' method).
            n_neighbors: Number of neighbors to return.
            nn_method: 'ann' (approximate forest) or 'sklearn' (exact);
                any other value yields an empty list.
        """
        if nn_method == 'ann':
            words = self.ann.get(word, n_neighbors)
        elif nn_method == 'sklearn':
            word_emb = self.getEmbedding(word)
            # kneighbors returns (distances, indices); take the index row.
            neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
            words = operator.itemgetter(*neighbors)(self.ds['word'])
        else:
            words = []
        return words

    def getCosineSimilarities(self, w1, w2):
        """Cosine similarity between the embeddings of words `w1` and `w2`."""
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2)),
        )