import re
import random
import string

import nltk

# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
nltk.download('punkt')

# Alphabet and length of the random placeholder tokens.  Placeholders are
# purely alphanumeric so the sentence tokenizer treats each one as a plain word.
_PLACEHOLDER_ALPHABET = string.ascii_letters + string.digits
_PLACEHOLDER_LENGTH = 10


def _new_placeholder(text, replaced_dict):
    """Return a random token that occurs neither in *text* nor in *replaced_dict*."""
    while True:
        token = ''.join(
            random.choices(_PLACEHOLDER_ALPHABET, k=_PLACEHOLDER_LENGTH)
        )
        # Collisions are extremely unlikely, but one would corrupt the
        # round trip, so re-draw instead of hoping.
        if token not in replaced_dict and token not in text:
            return token


def _mask_pattern(text, regex, replaced_dict):
    """Replace every non-empty match of *regex* in *text* with a fresh placeholder.

    Each placeholder -> matched-text pair is recorded in *replaced_dict*.
    """
    def mask(match):
        # Always mask the whole match (group 0).  re.findall would return the
        # capture-group contents instead, which may be empty or only part of
        # the matched text.
        original = match.group(0)
        if not original:
            # An empty match has nothing to protect; leave the text alone.
            return original
        placeholder = _new_placeholder(text, replaced_dict)
        replaced_dict[placeholder] = original
        return placeholder

    return re.sub(regex, mask, text, flags=re.IGNORECASE)


def replacement1(review, regex_list):
    """Mask every match of the given patterns with a random placeholder.

    Patterns are applied in order and matched case-insensitively.  A later
    pattern may match text that already contains placeholders created by an
    earlier one; ``replacement2`` undoes such nesting correctly.

    Args:
        review: Text to process.
        regex_list: Iterable of regular-expression patterns to mask.

    Returns:
        A ``(masked_text, replaced_dict)`` tuple where ``replaced_dict`` maps
        each placeholder to the exact text it replaced, in creation order.
    """
    replaced_dict = {}
    for regex in regex_list:
        review = _mask_pattern(review, regex, replaced_dict)
    return review, replaced_dict


def replacement2(sentences, replaced_dict):
    """Restore the original text for every placeholder in *sentences*.

    Args:
        sentences: List of strings; it is updated in place.
        replaced_dict: Placeholder -> original mapping from ``replacement1``.

    Returns:
        The same ``sentences`` list, with placeholders substituted back.
    """
    # Undo the newest placeholders first: the text behind a later placeholder
    # may itself contain earlier placeholders, which only become visible
    # (and restorable) once the later one has been expanded.
    newest_first = list(replaced_dict.items())[::-1]
    for index, sentence in enumerate(sentences):
        for randomized, original in newest_first:
            sentence = sentence.replace(randomized, original)
        sentences[index] = sentence
    return sentences


def parse_sentences(review):
    """Split *review* into sentences without breaking on known abbreviations.

    Abbreviations and quoted spans are masked before tokenization so their
    periods are not mistaken for sentence ends, then restored afterwards.

    Args:
        review: Text to split.

    Returns:
        List of sentence strings with the original text restored.
    """
    # Literal dots are escaped: an unescaped '.' matches any character, so
    # e.g. 'e.g.' would also match "egg." and hide a real sentence boundary.
    regex_list = [
        r'et al\.',
        r'"(.*?)"',
        r"'(.*?)'",
        r'e\.g\.',
        r'Sec\.',
        r'Sec \d+(?:\.\d+)?\.',
        r'w\.r\.t\.',
        # NOTE(review): 'e.q' is carried over from the original list;
        # possibly a typo for 'e.g' or 'eq.' -- confirm the intent.
        r'e\.q',
        r'fig\.',
    ]
    review, replaced_dict = replacement1(review, regex_list)
    sentences = nltk.sent_tokenize(review)
    sentences = replacement2(sentences, replaced_dict)
    return sentences