# File size: 4,985 Bytes | b3f3132
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import torch
from torch.utils.data import Dataset
import logging
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pickle
import string
from abc import abstractmethod
import json
class AbstractMoviesRanker:
    """Base class for rankers that score the rows of a dataframe against a query.

    Subclasses implement :meth:`encode_query`. Scoring is the matrix product
    of the encoded query with the transposed ``index_matrix``; the best
    scoring rows of ``df`` are returned with an extra score column.
    """

    # Translation table that deletes every character of ``string.punctuation``
    # (ASCII punctuation only). Built once instead of on every call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, df, index_matrix, score_name="score"):
        """Store the items to rank and the index used to score them.

        Args:
            df: pandas DataFrame of items; its index labels serve as item ids.
            index_matrix: 2-D torch tensor multiplied with the encoded query.
                Presumably one row per row of ``df``, in the same order --
                TODO confirm against the code that builds the index.
            score_name: name of the column that receives the scores.
        """
        self.df = df
        self.ids = self.df.index.values
        self.index_matrix = index_matrix
        self.score_name = score_name

    @abstractmethod
    def encode_query(self, query):
        """Turn ``query`` into a 2-D tensor compatible with ``index_matrix``.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        # The class does not use ABCMeta, so @abstractmethod alone does not
        # prevent instantiation. Fail loudly here instead of returning None
        # and crashing later inside torch.mm.
        raise NotImplementedError("subclasses must implement encode_query")

    def get_scores(self, encoded_query):
        """Return one score per row of ``index_matrix`` as a list of floats.

        Only the first row of ``encoded_query`` is used.
        """
        return torch.mm(encoded_query, self.index_matrix.transpose(0, 1))[0].tolist()

    def get_top_ids(self, scores, topn=6):
        """Return the ``topn`` best scoring rows of ``df`` with their scores.

        Args:
            scores: iterable of scores paired positionally with ``self.ids``.
            topn: number of rows to keep.

        Returns:
            A new DataFrame (``self.df`` is left untouched) holding the
            selected rows in decreasing score order, with the scores stored
            in the ``score_name`` column.
        """
        # sorted() is stable, so items with equal scores keep their original
        # relative order.
        ranked = sorted(
            zip(self.ids.tolist(), scores),
            key=lambda pair: pair[1],
            reverse=True,
        )[:topn]
        top_ids = [item_id for item_id, _ in ranked]
        top_scores = [score for _, score in ranked]
        # Explicit copy: the score column must never be written through to
        # the shared dataframe (and pandas must not warn about it).
        top_frame = self.df.loc[top_ids, :].copy()
        top_frame[self.score_name] = top_scores
        return top_frame

    def run_query(self, query, topn=6):
        """Encode ``query``, score every item and return the ``topn`` best rows."""
        encoded_query = self.encode_query(query)
        scores = self.get_scores(encoded_query)
        return self.get_top_ids(scores, topn)

    @staticmethod
    def depunctuate(x):
        """Return ``x`` with all ASCII punctuation characters removed."""
        return x.translate(AbstractMoviesRanker._PUNCTUATION_TABLE)
class SparseTfIdfRanker(AbstractMoviesRanker):
    """Sparse ranking via TF-IDF.

    Scores land in the ``tfidf-score`` column.
    """

    def __init__(self, df, index_matrix, vectorizer_path):
        """Load the pickled vectorizer and densify the index.

        Args:
            df: pandas DataFrame of items to rank.
            index_matrix: torch tensor index; converted with ``to_dense()``
                so it can be used in the dot products of ``get_scores``.
            vectorizer_path: path of a pickled vectorizer exposing
                ``transform`` (presumably a fitted scikit-learn
                TfidfVectorizer -- TODO confirm).
        """
        super().__init__(df, index_matrix, score_name='tfidf-score')
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point vectorizer_path at files from a trusted source.
        # The context manager closes the file handle, which the previous
        # bare open() call leaked.
        with open(vectorizer_path, 'rb') as fp:
            self.vectorizer = pickle.load(fp)
        self.index_matrix = self.index_matrix.to_dense()  # For dot products

    def encode_query(self, query):
        """Return the L2-normalized float32 vector of the depunctuated query."""
        encoded_query = torch.tensor(
            self.vectorizer.transform([self.depunctuate(query)]).todense(),
            dtype=torch.float32,
        )
        return F.normalize(encoded_query, p=2)
class BertRanker(AbstractMoviesRanker):
    """Dense ranking against an embedding matrix.

    Scores land in the ``bert-score`` column.
    """

    def __init__(self, df, index_matrix, modelpath):
        """Load the tokenizer and the model found at ``modelpath``.

        Args:
            df: pandas DataFrame of items to rank.
            index_matrix: torch tensor of item embeddings used for scoring.
            modelpath: name or path handed to ``from_pretrained`` for both
                the tokenizer and the model.
        """
        super().__init__(df, index_matrix, score_name="bert-score")
        self.tokenizer = AutoTokenizer.from_pretrained(modelpath)
        self.model = AutoModel.from_pretrained(modelpath)

    def encode_query(self, query):
        """Return the L2-normalized, mean-pooled embedding of ``query``.

        The query is truncated or padded to 128 tokens.
        """
        tok_q = self.tokenizer(
            query,
            return_tensors="pt",
            padding="max_length",
            max_length=128,
            truncation=True,
        )
        # Pure inference: without no_grad() autograd records the whole
        # forward pass, wasting memory and time, and the returned embedding
        # would carry requires_grad=True.
        with torch.no_grad():
            o = self.model(**tok_q)
            encoded_query = self.mean_pooling(o, tok_q['attention_mask'])
        return F.normalize(encoded_query, p=2)

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Average the token embeddings, ignoring masked-out positions.

        Args:
            model_output: model output exposing ``last_hidden_state``.
            attention_mask: mask tensor, broadcast to the embeddings' size;
                positions holding 0 do not contribute to the average.
        """
        token_embeddings = model_output.last_hidden_state  # all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp() guards against a division by zero when the mask is all zeros.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )
class SparseDenseMoviesRanker():
    """Two-stage ranker: TF-IDF shortlist, then dense re-ranking of the shortlist."""

    def __init__(self, df, modelpath, bert_index, sparse_index, vectorizer_path):
        """Build the sparse engine; the dense engine is created on first query.

        Args:
            df: pandas DataFrame of items to rank.
            modelpath: name or path of the model used by the dense ranker.
            bert_index: tensor of dense item embeddings. It is indexed with
                the dataframe's index labels, so those labels are presumably
                integer row positions -- TODO confirm.
            sparse_index: tensor index handed to the TF-IDF ranker.
            vectorizer_path: path of the pickled vectorizer.
        """
        self.df = df
        self.ids = self.df.index.values
        self.tfidf_engine = SparseTfIdfRanker(df, sparse_index, vectorizer_path)
        self.modelpath = modelpath
        self.bert_index = bert_index
        # Default size of the TF-IDF shortlist; from_json_config may override it.
        self.first_ranking = 1000
        # Created lazily so the model is loaded once, not once per query.
        self.bert_engine = None

    def run_query(self, query, topn=6, first_ranking=None):
        """Return the ``topn`` best items for ``query``.

        Args:
            query: query text.
            topn: number of rows in the final result.
            first_ranking: size of the TF-IDF shortlist that gets re-ranked;
                ``None`` (the default) uses ``self.first_ranking``.

        Returns:
            DataFrame of the selected rows carrying both the ``tfidf-score``
            and the ``bert-score`` columns, sorted by decreasing dense score.
        """
        if first_ranking is None:
            first_ranking = self.first_ranking
        tfidf_sorted_frame = self.tfidf_engine.run_query(query, topn=first_ranking)
        firstranking_index = self.bert_index[tfidf_sorted_frame.index.values]
        if self.bert_engine is None:
            # First query: this loads the tokenizer and the model.
            self.bert_engine = BertRanker(tfidf_sorted_frame, firstranking_index, self.modelpath)
        else:
            # Later queries: reuse the loaded model and only swap the
            # candidates, leaving the engine in the same state a freshly
            # built BertRanker would have.
            self.bert_engine.df = tfidf_sorted_frame
            self.bert_engine.ids = tfidf_sorted_frame.index.values
            self.bert_engine.index_matrix = firstranking_index
        bert_sorted_frame = self.bert_engine.run_query(query, topn=topn)
        return bert_sorted_frame

    @classmethod
    def from_json_config(cls, jsonfile):
        """Build a ranker from a JSON configuration file.

        Required keys: ``dataframe``, ``bert_index``, ``sparse_index``,
        ``vectorizer_path`` and ``modelpath``. The optional ``firstranking``
        key sets the default shortlist size; it used to be read and then
        silently discarded. When the key is absent the shortlist size stays
        at 1000, as before.
        """
        with open(jsonfile) as fp:
            conf = json.load(fp)
        # NOTE(security): read_pickle and torch.load unpickle their input;
        # only use configuration files that point at trusted artifacts.
        # Load data for ranking
        df = pd.read_pickle(conf['dataframe'])
        # Load indices, e.g. embeddings and encoding utilities
        bert_index = torch.load(conf['bert_index'])
        sparse_index = torch.load(conf['sparse_index'])
        vectorizer_path = conf['vectorizer_path']
        modelpath = conf['modelpath']
        ranker = cls(df, modelpath, bert_index, sparse_index, vectorizer_path)
        # Conf for first ranking
        if 'firstranking' in conf:
            ranker.first_ranking = conf['firstranking']
        return ranker
if __name__ == '__main__':
    # Demo run: rank two sample French queries with the engine described by
    # conf.json and print the head of each result frame.
    sample_queries = (
        "une histoire de pirates et de chasse au trésor",
        "une histoire de gangsters avec de l'argent",
    )
    ranker = SparseDenseMoviesRanker.from_json_config('conf.json')
    for text in sample_queries:
        print(text)
        results = ranker.run_query(text)
        print(results.head())
|