# GrammarGuru/src/models/gensim_vect_v2.py
import argparse
import re
from random import choice
from typing import List

import pandas as pd
from gensim import corpora, similarities
from gensim.models import TfidfModel
from gensim.parsing import (preprocess_string, remove_stopwords, stem_text,
                            strip_multiple_whitespaces, strip_numeric,
                            strip_punctuation, strip_tags)

from utils.constants import TEST_INPUTS
# Helper filters: lowercase the text and drop isolated single characters.
transform_to_lower = lambda s: s.lower()
# Substitute a space (not ''), so the words around the match are not glued together.
remove_single_char = lambda s: re.sub(r'\s+\w\s+', ' ', s)
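# Hypothetical example of the pipeline defined by cleaning_filters below:
#   preprocess_string("The <b>3</b> cats sat!", filters)
#   -> ['cats', 'sat']
# (tags, digits, punctuation, stopwords, and single characters stripped).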
class PaperRecommender:
    """Recommends arXiv papers via TF-IDF similarity over paper abstracts."""

    def __init__(self,
                 num_samples=3000,
                 corpus_dictionary_path="30Ktokens",
                 arxiv_dataset_path="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip",
                 save_dict=False,
                 query=None):
        self.num_samples = num_samples
        self.corpus_dictionary_path = corpus_dictionary_path
        self.arxiv_dataset_path = arxiv_dataset_path
        self.save_dict = save_dict
        self.query = query
        self.cleaning_filters = [
            strip_tags,
            strip_numeric,
            strip_punctuation,
            strip_multiple_whitespaces,
            transform_to_lower,
            remove_stopwords,
            remove_single_char
        ]
        # Populated by run_recommender().
        self.dictionary = None
        self.index = None
        self.tfidf_model = None
        self.df = None
    def gensim_tokenizer(self, docs: List[str]) -> List[List[str]]:
        """Apply the cleaning filters to each document, returning token lists."""
        return [preprocess_string(doc, self.cleaning_filters) for doc in docs]
    def cleaning_pipe(self, document: str) -> List[str]:
        """Clean and tokenize a single document."""
        return preprocess_string(document, self.cleaning_filters)
    def get_gensim_dictionary(self, tokenized_docs: List[List[str]], dict_name: str = "corpus"):
        """Build a gensim Dictionary from the tokenized corpus, optionally saving it to disk."""
        dictionary = corpora.Dictionary(tokenized_docs)
        if self.save_dict:
            parent_folder = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries"
            dictionary.save(f'{parent_folder}/{dict_name}.dict')
        return dictionary
    def get_closest_n(self, query: str, n: int):
        """Return the indices of the n documents most similar to the query."""
        query_document = self.cleaning_pipe(query)
        query_bow = self.dictionary.doc2bow(query_document)
        sims = self.index[self.tfidf_model[query_bow]]
        # Indices of the n highest similarity scores, best match first.
        top_idx = sims.argsort()[-n:][::-1]
        return top_idx
    def get_recommendations_metadata(self, query: str, n: int):
        """Return the metadata rows of the top-n recommended papers."""
        recommendations_idxs = self.get_closest_n(query, n)
        return self.df.iloc[recommendations_idxs].reset_index(drop=True)
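    # Usage sketch (names are from this class; the exact flow is an assumption):
    #   rec = PaperRecommender(num_samples=1000, query="graph embeddings")
    #   rec.run_recommender()   # builds dictionary, TF-IDF model, and index
    #   rec.get_recommendations_metadata("graph embeddings", n=5)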
    def run_recommender(self, n: int = 5):
        """Fit the dictionary, TF-IDF model, and similarity index, then answer the query."""
        if self.num_samples is None:
            self.df = pd.read_parquet(self.arxiv_dataset_path)
        else:
            # Sample a subset of the dataset only when num_samples is set.
            self.df = pd.read_parquet(self.arxiv_dataset_path).sample(self.num_samples).reset_index(drop=True)
        corpus = self.df['cleaned_abstracts'].to_list()
        tokenized_corpus = self.gensim_tokenizer(corpus)
        self.dictionary = self.get_gensim_dictionary(tokenized_docs=tokenized_corpus, dict_name=self.corpus_dictionary_path)
        BoW_corpus = [self.dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus]
        self.tfidf_model = TfidfModel(BoW_corpus)
        self.index = similarities.SparseMatrixSimilarity(self.tfidf_model[BoW_corpus], num_features=len(self.dictionary))
        if self.query is None:
            # Fall back to a random canned query when none was provided.
            self.query = choice(TEST_INPUTS)
        self.results = self.get_recommendations_metadata(self.query, n)
        return self.results
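# The argparse import suggests this module is meant to run as a script, but no
# entry point survives in this file. A minimal, hypothetical CLI sketch follows;
# the flag names (--query, --samples, --top-n) are assumptions.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="TF-IDF arXiv paper recommender")
    parser.add_argument("--query", type=str, default=None,
                        help="Free-text query; a random TEST_INPUTS entry is used if omitted.")
    parser.add_argument("--samples", type=int, default=3000,
                        help="Number of papers to sample from the parquet dataset.")
    parser.add_argument("--top-n", type=int, default=5,
                        help="Number of recommendations to return.")
    args = parser.parse_args()

    recommender = PaperRecommender(num_samples=args.samples, query=args.query)
    print(recommender.run_recommender(n=args.top_n))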