File size: 6,102 Bytes

4839ed5

# -*- coding: utf-8 -*-
import re
from snowballstemmer import stemmer
import arabicstopwords.arabicstopwords as stp
from tqdm import tqdm
import pandas as pd
import arabicstopwords.arabicstopwords as ar_stp
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from snowballstemmer import stemmer
from nltk.stem import PorterStemmer
import string
import logging
import global_variables as gb


# Module-level stemmer instances, created once at import time and reused by
# ar_stem() and en_stem() below.
ar_stemmer = stemmer("arabic")  # snowballstemmer instance for Arabic
porter= PorterStemmer()  # NLTK Porter stemmer, used by en_stem()

# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    """Load a tabular file into a pandas DataFrame.

    Paths ending in ".xlsx" are read with ``pd.read_excel``. Any other path is
    read with ``pd.read_csv`` as UTF-8 text, using ``sep`` as the delimiter.

    Args:
        input_file: Path of the file to read.
        sep: Field delimiter for delimited text files (tab by default).
        names: Column names forwarded to ``pd.read_csv``; the empty-string
            default means "do not pass column names".

    Returns:
        The DataFrame produced by pandas.
    """
    if input_file.endswith(".xlsx"):
        return pd.read_excel(input_file)

    # Only forward ``names`` when the caller supplied something.
    csv_options = {"sep": sep, "encoding": "utf-8"}
    if names != "":
        csv_options["names"] = names
    return pd.read_csv(input_file, **csv_options)


def remove_punctuation(text):
    """Return ``text`` with every non-word, non-whitespace character removed.

    Word characters (letters, digits and the underscore) and whitespace are
    kept; everything else is deleted without leaving a replacement.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)


# Normalize Arabic letter variants in the given text (e.g. tweets).
def normalize_arabic(text):
    """Map common Arabic letter variants onto a single canonical letter.

    The substitutions are applied in order:
      * alef variants (hamza below/above, wasla, madda) -> bare alef
      * alef maqsura -> ya
      * hamza on waw and hamza on ya -> standalone hamza
      * ta marbuta -> ha

    Returns the normalized string.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def remove_punctuations_tashkeel(text):
    """
    Strip punctuation marks and Arabic diacritics (tashkeel) from ``text``.

    The input should be an Arabic string. Punctuation covers an explicit list
    of Arabic/typographic marks plus ``string.punctuation``; the diacritics
    removed are the ones listed in the verbose pattern below.
    """
    tashkeel_pattern = re.compile(
        """
                                ّ    | # Shadda
                                َ    | # Fatha
                                ً    | # Tanwin Fath
                                ُ    | # Damma
                                ٌ    | # Tanwin Damm
                                ِ    | # Kasra
                                ٍ    | # Tanwin Kasr
                                ْ    | # Sukun
                                ـ     # Tatwil/Kashida
                         """,
        re.VERBOSE,
    )
    unwanted_marks = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ""" + string.punctuation

    # First delete the punctuation characters in a single translate pass...
    without_punctuation = text.translate(str.maketrans("", "", unwanted_marks))

    # ...then drop whatever tashkeel is left.
    return tashkeel_pattern.sub("", without_punctuation)


def remove_longation(text):
    """Unify Arabic letter variants (despite the name, nothing is shortened).

    Applied in order: alef with hamza/madda -> bare alef, alef maqsura -> ya,
    hamza on waw or ya -> standalone hamza, ta marbuta -> ha, and gaf -> kaf.
    """
    letter_rules = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for variant, canonical in letter_rules:
        text = re.sub(variant, canonical, text)
    return text


def remove_harakaat(text):
    """Keep only Arabic letters and digits, dropping harakaat and tatweel.

    Harakaat (U+064B-U+0652) and the tatweel/kashida (U+0640) are deleted
    first. The remaining text is then scanned for runs of Arabic letters
    and/or digits, and those runs are joined with single spaces. Everything
    else (Latin letters, punctuation, symbols) is discarded.

    Args:
        text: Input string.

    Returns:
        The space-separated runs of Arabic letters/digits; an empty string
        when nothing qualifies.
    """
    # harakaat and tatweel (kashida) to remove
    diacritics = re.compile(r"[\u064b-\u0652\u0640]")

    # Runs of Arabic letters and digits. The '+' quantifier must stay outside
    # the character class: inside it, '+' is a literal and plus signs would be
    # kept in the output.
    kept_run = re.compile(r"[\u0621-\u063A\u0641-\u064A\d]+")

    # Every match is non-empty and whitespace-free, so the joined result has
    # no leading or trailing whitespace to strip.
    return " ".join(kept_run.findall(diacritics.sub("", text)))


# Remove Arabic stop words from a sentence.
def ar_remove_stop_words(sentence):
    """Return ``sentence`` without the tokens found in the Arabic stop-word list.

    The sentence is split on whitespace, tokens present in
    ``ar_stp.stopwords_list()`` are dropped, and the survivors are re-joined
    with single spaces.
    """
    arabic_stop_words = set(ar_stp.stopwords_list())
    kept_tokens = [
        token for token in sentence.split() if token not in arabic_stop_words
    ]
    return " ".join(kept_tokens)

    
def ar_stem(sentence):
    """Stem each whitespace-separated token with the module's Arabic stemmer.

    Returns the stems joined by single spaces.
    """
    stems = map(ar_stemmer.stemWord, sentence.split())
    return " ".join(stems)


# Remove English stop words from a sentence.
def en_remove_stop_words(sentence):
    """Return ``sentence`` without the tokens found in NLTK's English stop words.

    Tokens are obtained by splitting on whitespace and compared as-is (no case
    folding happens here); the survivors are re-joined with single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    return " ".join(
        token for token in sentence.split() if token not in english_stop_words
    )


def en_stem(sentence):
    """Tokenize ``sentence`` with NLTK and Porter-stem every token.

    Returns the stems joined by single spaces.
    """
    stems = []
    for token in word_tokenize(sentence):
        stems.append(porter.stem(token))
    return " ".join(stems)



def clean(text):
    """Clean tweet-like text.

    Removes URLs, the retweet marker, @handles, a fixed set of special
    characters, emoji and smileys, then collapses every run of whitespace
    (including tabs and line breaks) into one space and trims both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with no leading, trailing or repeated whitespace.
    """
    text = re.sub(r"http\S+", " ", text)  # remove urls
    # Remove the retweet marker only when "RT" starts a word, so that words
    # merely ending in "RT" (e.g. "SMART ") are left intact.
    text = re.sub(r"\bRT ", " ", text)
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[.,#_|:?/=]", " ", text)  # remove special characters

    # Strip emoji/smileys before normalizing whitespace; doing it afterwards
    # would leave double or trailing spaces where they were removed.
    text = remove_emoji_smileys(text)

    # \s+ also covers tabs and line breaks.
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def remove_emoji_smileys(text):
    """Delete emoji and ASCII smileys from ``text``.

    Emoji are matched by code-point range: U+2600-U+27BF, U+1F300-U+1F64F and
    U+1F680-U+1F6FF. Smileys are matched case-insensitively as an "eyes"
    character (``:``, ``;`` or ``=``, or ``:X`` with an optional leading
    space), an optional ``-`` nose, and one or more "mouth" characters.

    Matches are removed without inserting a replacement, so surrounding
    whitespace is left as it was.

    Args:
        text: Input string.

    Returns:
        The string with smileys and emoji removed.
    """
    # Python 3 strings hold full code points, so the wide-range pattern always
    # compiles; no surrogate-pair (UCS-2) fallback is needed.
    emojis_pattern = re.compile(
        u"([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])"
    )
    smileys_pattern = re.compile(
        r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE
    )

    # Smileys first, then emoji (same order as before).
    text = smileys_pattern.sub("", text)
    return emojis_pattern.sub("", text)





def preprocess_english(sentence):
    """Run the English pipeline: lowercase, drop stop words, then stem."""
    lowered = sentence.lower()
    without_stop_words = en_remove_stop_words(lowered)
    return en_stem(without_stop_words)



def preprocess_arabic(sentence): # for Arabic
    """Run the Arabic pipeline: normalize letters, drop stop words, then stem."""
    normalized = normalize_arabic(sentence)
    without_stop_words = ar_remove_stop_words(normalized)
    return ar_stem(without_stop_words)


def preprocess(query, lang):
    """Clean ``query`` and run the language-specific preprocessing pipeline.

    The query is first passed through ``clean`` and ``remove_punctuation``.
    When ``lang`` equals "en" the English pipeline is applied; any other value
    selects the Arabic pipeline.
    """
    cleaned = remove_punctuation(clean(query))
    language_pipeline = preprocess_english if lang == "en" else preprocess_arabic
    return language_pipeline(cleaned)



def initailize_logger(logger, log_file, level):
    """Attach a file handler and a stream handler to ``logger`` (once).

    If the logger already has at least one handler it is returned untouched,
    so repeated calls do not add duplicate handlers or change the level.

    Args:
        logger: The ``logging.Logger`` to configure.
        log_file: Path given to ``logging.FileHandler``.
        level: Level set on the logger when it is configured.

    Returns:
        The same ``logger`` object.
    """
    if logger.handlers:
        # Already configured: avoid creating more than one set of handlers.
        return logger

    line_format = logging.Formatter(
        '%(asctime)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    to_file = logging.FileHandler(log_file)
    to_console = logging.StreamHandler()
    for handler in (to_file, to_console):
        handler.setFormatter(line_format)

    logger.setLevel(level)
    # File handler first, then console handler.
    for handler in (to_file, to_console):
        logger.addHandler(handler)
    return logger