harir's picture
add parser.py
8cb156b verified
raw
history blame
1.03 kB
import re
import random
import string
import nltk
def replacement1(review, regex_list):
    """Mask every match of the given patterns with a random placeholder.

    Each pattern is applied in turn, case-insensitively, to the text left by
    the previous pattern. Every non-empty match is swapped for a fresh
    10-character alphanumeric placeholder.

    Args:
        review: Text to mask.
        regex_list: Iterable of regular-expression pattern strings.

    Returns:
        A ``(masked_review, replaced_dict)`` tuple. ``replaced_dict`` maps
        each placeholder to the exact text it replaced, in the order the
        substitutions were made. Because a later pattern can match text that
        already contains an earlier placeholder, undoing the substitutions in
        reverse insertion order is what restores the input exactly.
    """
    replaced_dict = {}
    alphabet = string.ascii_letters + string.digits

    for regex in regex_list:
        pieces = []
        cursor = 0
        # finditer + group(0) gives the whole match even when the pattern has
        # capture groups; re.findall would return only the group contents.
        for found in re.finditer(regex, review, re.IGNORECASE):
            original = found.group(0)
            if not original:
                # An empty match carries nothing to protect; masking it would
                # only inject noise into the text.
                continue
            placeholder = ''.join(random.choices(alphabet, k=10))
            # Re-draw on the (very unlikely) collision with existing text or
            # an earlier placeholder so restoration stays unambiguous.
            while placeholder in replaced_dict or placeholder in review:
                placeholder = ''.join(random.choices(alphabet, k=10))
            pieces.append(review[cursor:found.start()])
            pieces.append(placeholder)
            cursor = found.end()
            replaced_dict[placeholder] = original
        pieces.append(review[cursor:])
        review = ''.join(pieces)

    return review, replaced_dict
def replacement2(sentences, replaced_dict):
    """Put the original text back in place of each placeholder.

    Args:
        sentences: List of strings that may contain placeholders. The list is
            updated in place.
        replaced_dict: Mapping of placeholder -> original text, ordered by
            when each substitution was made.

    Returns:
        The same ``sentences`` list, with every placeholder expanded.
    """
    # Undo the newest substitution first. If a later substitution replaced
    # part of an earlier placeholder (or swallowed one whole), the earlier
    # placeholder only reappears after the later one has been expanded.
    undo_order = list(replaced_dict.items())[::-1]

    for index, sentence in enumerate(sentences):
        for placeholder, original in undo_order:
            sentence = sentence.replace(placeholder, original)
        sentences[index] = sentence
    return sentences
def parse_sentences(review):
    """Split ``review`` into sentences, shielding abbreviations and quotes.

    Spans matching the patterns below are masked with placeholders via
    ``replacement1``, the masked text is split with ``nltk.sent_tokenize``,
    and ``replacement2`` then puts the original spans back.

    Args:
        review: Text to split.

    Returns:
        The list of sentences with the masked spans restored.
    """
    # Dots are escaped so each one matches a literal period only. Left as
    # wildcards, these patterns can match inside the random alphanumeric
    # placeholders or swallow a real "?"/"!" sentence terminator.
    # The quote patterns and the optional section suffix avoid capturing
    # groups so that the whole span is masked however matches are collected.
    regex_list = [
        r'et al\.',
        r'".*?"',
        # NOTE(review): this also pairs up apostrophes ("don't ... it's") on
        # the same line - confirm that is acceptable for the input texts.
        r"'.*?'",
        r'e\.g\.',
        r'Sec\.',
        r'Sec \d+(?:\.\d+)?\.',
        r'w\.r\.t\.',
        # NOTE(review): kept as a literal "e.q"; possibly meant "e.g." or
        # "eq." - confirm the intended abbreviation.
        r'e\.q',
        r'fig\.',
    ]
    masked_review, replaced_dict = replacement1(review, regex_list)
    sentences = nltk.sent_tokenize(masked_review)
    return replacement2(sentences, replaced_dict)