# Text-cleaning helpers for Arabic social-media text: URL / e-mail / emoji
# removal, letter normalization, diacritic stripping and stop-word filtering.

import re
import string
from typing import Optional

import nltk

# Import-time side effect kept from the original module: make sure the NLTK
# stop-word corpus is available before it is read below.
nltk.download('stopwords')

arabic_stopwords = set(nltk.corpus.stopwords.words("arabic"))

# Single-character class instead of a re.VERBOSE alternation: a verbose
# pattern silently breaks if its line breaks are lost, because everything
# after the first '#' then becomes a comment.
arabic_diacritics = re.compile(
    "["
    "\u0651"  # Tashdid
    "\u064E"  # Fatha
    "\u064B"  # Tanwin Fath
    "\u064F"  # Damma
    "\u064C"  # Tanwin Damm
    "\u0650"  # Kasra
    "\u064D"  # Tanwin Kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
    "]"
)

arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations = arabic_punctuations + english_punctuations

# Patterns and tables are built once at import time rather than on every call.
_URL_RE = re.compile(
    r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', flags=re.MULTILINE
)

# NOTE: anchored with ^...$ under MULTILINE, so this only matches an address
# that occupies an entire line by itself.
_EMAIL_RE = re.compile(
    r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", flags=re.MULTILINE
)

# NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are very broad
# (the first spans most of the BMP above U+24C2, e.g. CJK and the Arabic
# presentation forms). They are reproduced unchanged to preserve behavior.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"  # box drawing .. miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    re.UNICODE,
)

# Letter-variant folding used by normalization(). No replacement character is
# itself rewritten by another entry, so one translate pass is equivalent to
# applying the substitutions one after another.
_NORMALIZATION_TABLE = str.maketrans({
    "إ": "ا",  # alef with hamza below -> bare alef
    "أ": "ا",  # alef with hamza above -> bare alef
    "آ": "ا",  # alef with madda -> bare alef
    "ى": "ي",  # alef maksura -> yeh
    "ؤ": "ء",  # waw with hamza -> hamza
    "ئ": "ء",  # yeh with hamza -> hamza
    "ة": "ه",  # teh marbuta -> heh
    "گ": "ك",  # gaf -> kaf
})

# Every punctuation character becomes a space so that neighbouring words stay
# separated; the surplus spaces vanish when the text is re-joined on whitespace.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in punctuations})


def remove_urls(text: str) -> str:
    """Return *text* with ``http://``, ``https://`` and bare ``://`` URLs removed."""
    return _URL_RE.sub('', text)


def remove_emails(text: str) -> str:
    """Return *text* with e-mail addresses that fill a whole line removed.

    Addresses embedded in a longer line are not matched by this pattern.
    """
    return _EMAIL_RE.sub("", text)


def remove_emoji(data: str) -> str:
    """Return *data* with every character matched by the emoji ranges removed."""
    return _EMOJI_RE.sub('', data)


def normalization(text: str) -> str:
    """Fold Arabic letter variants (alef forms, hamza carriers, etc.) to one form."""
    return text.translate(_NORMALIZATION_TABLE)


def remove_diacritics(text: str) -> str:
    """Return *text* without Arabic diacritic marks and tatweel."""
    return arabic_diacritics.sub('', text)


def remove_stopwords(text: str) -> str:
    """Drop Arabic stop words and re-join the remaining tokens with single spaces."""
    kept = [word for word in text.split() if word not in arabic_stopwords]
    return ' '.join(kept)


def cleaning_content(line) -> Optional[str]:
    """Clean one raw text record.

    Steps, in order: remove whole-line e-mails, URLs and emoji; drop tokens
    containing ``@``; remove the ``RT`` marker and double quotes; replace
    punctuation with spaces; remove stop words; normalize letters; strip
    diacritics.

    Args:
        line: The raw text. Any non-string value (for example a float NaN
            standing in for a missing cell) is treated as missing.

    Returns:
        The cleaned string, or ``None`` when *line* is not a string.
    """
    if not isinstance(line, str):
        return None

    # Newlines need no explicit handling: every later step splits on
    # arbitrary whitespace. Keeping them here also lets the line-anchored
    # e-mail pattern match addresses that sit on their own line.
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)

    # Tokens containing '@' (mentions, leftover addresses) are swapped for a
    # placeholder that is deleted together with the other markers below.
    tokens = ['USERID' if '@' in token else token for token in line.split()]
    line = ' '.join(tokens)

    # NOTE(review): the source also chained two replace('', '') calls and a
    # line-break replace here, all no-ops at this point. They look like
    # markup tokens lost in extraction -- confirm against the upstream file.
    # 'RT' is removed as a plain substring, exactly as before.
    for marker in ('RT', '"', 'USERID'):
        line = line.replace(marker, '')

    line = line.translate(_PUNCT_TO_SPACE)
    # Stop words are filtered before normalization, as in the original order.
    line = remove_stopwords(line)
    line = remove_diacritics(normalization(line))
    return line.strip()


def hasDigits(s) -> bool:
    """Return True if *s* contains an ASCII (0-9) or Arabic-Indic (U+0660-U+0669) digit."""
    return any('0' <= ch <= '9' or '\u0660' <= ch <= '\u0669' for ch in s)