|
from nltk.stem.isri import ISRIStemmer |
|
from pyarabic.araby import strip_tashkeel, strip_tatweel |
|
import numpy as np |
|
import pandas as pd |
|
import json |
|
import re |
|
import time |
|
import os |
|
import math |
|
import random |
|
|
|
# Module-level stemmer shared by every call to `stemming`, so the ISRI
# stemmer object is constructed once rather than on each call.
isristemmer = ISRIStemmer()


def stemming(txt):
    """Stem every whitespace-separated token of ``txt`` with the ISRI stemmer.

    ``ISRIStemmer.stem`` works on a single word token; handing it a whole
    sentence makes it treat the sentence as one long word. Each run of
    non-whitespace characters is therefore stemmed on its own, and the
    whitespace between tokens is left exactly as it was.

    Args:
        txt: A single word, or a text made of whitespace-separated words.

    Returns:
        ``txt`` with each token replaced by its stem. For input without
        whitespace this equals ``isristemmer.stem(txt)``.
    """
    # A callable replacement is inserted verbatim, so stems are never
    # interpreted as regex replacement templates.
    return re.sub(r'\S+', lambda match: isristemmer.stem(match.group()), txt)
|
|
|
|
|
def remove_singleCharacter(text):
    """Drop every one-character token from ``text``.

    The text is tokenized with ``pyarabic.araby.tokenize`` and the tokens
    whose length is not exactly 1 are joined back with single spaces.

    Args:
        text: The text to filter.

    Returns:
        The remaining tokens separated by single spaces, or an empty string
        when no token remains.
    """
    # Function-scope import: the module's top-level import only brings in
    # `strip_tashkeel` and `strip_tatweel` from pyarabic.araby, and no `ar`
    # alias is bound anywhere at module level.
    from pyarabic.araby import tokenize

    return ' '.join(token for token in tokenize(text) if len(token) != 1)
|
|
|
|
|
# Translation table mapping every character that `remove_punctuations`
# treats as punctuation (Latin set followed by the Arabic/misc set) to a
# space. Built once so each call is a single `str.translate` pass.
# The backslash is written as `\\` because `\,` is not a valid escape.
_PUNCTUATION_TO_SPACE = dict.fromkeys(
    map(
        ord,
        '''()-[]{};:'"\\,<>./@#$%^&*،؛_~'''
        '''`÷×؛_ـ،/:".,'~¦+|”…“–ـ=﴾﴿ ﹱ ﹹ ⸀˓• ב''',
    ),
    ' ',
)


def remove_punctuations(text):
    """Replace every punctuation character in ``text`` with a space.

    Each listed character becomes exactly one space, so the length of the
    text does not change and runs of punctuation become runs of spaces.
    Characters that are not in the table are left untouched.

    Args:
        text: The text to clean.

    Returns:
        ``text`` with each punctuation character replaced by ``' '``.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)
|
|
|
|
|
def normalize_text(txt):
    """Strip tashkeel and tatweel, then squeeze repeated characters.

    ``txt`` is passed through pyarabic's ``strip_tashkeel`` and then
    ``strip_tatweel``. Afterwards every run of the same character is reduced
    to a single occurrence of that character.

    Args:
        txt: The text to normalize.

    Returns:
        The stripped text with no two identical adjacent characters.
    """
    stripped = strip_tatweel(strip_tashkeel(txt))
    # Keep the first character, then every character that differs from the
    # one immediately before it.
    squeezed_tail = ''.join(
        current
        for previous, current in zip(stripped, stripped[1:])
        if previous != current
    )
    return stripped[:1] + squeezed_tail
|
|
|
|
|
def remove_stopwords(txt, path="stopword.txt"):
    """Remove the stop words listed in a file from ``txt``.

    ``txt`` is split on single spaces and every token that appears as a line
    of the stop-word file is dropped; the rest are joined with single
    spaces. The file content is split on ``'\\n'``, so a file ending with a
    newline contributes the empty string as a stop word, which also drops
    the empty tokens produced by consecutive spaces.

    Args:
        txt: The text to filter.
        path: Path of a UTF-8 text file holding one stop word per line.

    Returns:
        The surviving tokens separated by single spaces.

    Raises:
        OSError: If the stop-word file cannot be opened or read.
    """
    # `with` closes the file even when reading fails.
    with open(path, 'r', encoding='utf-8') as stop_words_file:
        # A set gives constant-time membership tests in the filter below.
        stop_words = set(stop_words_file.read().split('\n'))

    return ' '.join(
        word for word in txt.split(' ') if word not in stop_words
    )
|
|
|
|
|
def Remove_unwanted(text):
    """Remove URLs, Latin letters and digits, and unify Arabic letter forms.

    Steps, in order:

    1. Lines that start with a URL, and any remaining token that begins
       with ``http``, are replaced by a space.
    2. Whitespace runs are collapsed, Latin letters are replaced by a
       space, and the text is trimmed.
    3. Digits are replaced by spaces.
    4. Alef variants become bare alef, alef maksura becomes yeh, and hamza
       carried on waw/yeh becomes a standalone hamza.
    5. Space runs are collapsed again and the result is trimmed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text with single spaces between tokens and no leading
        or trailing whitespace.
    """
    # --- URLs ---
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    # `http?` makes the final "p" optional, so this additionally catches
    # lines that start with "htt://".
    text = re.sub(r'^http?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    # Also covers "https...": every such token starts with "http".
    text = re.sub(r"http\S+", " ", text)

    # --- whitespace and Latin letters ---
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[a-zA-Z]+', ' ', text)
    text = re.sub(r"^\s+|\s+$", "", text)

    # --- digits ---
    text = re.sub(r"(\s\d+)", " ", text)
    # NOTE(review): the first alternative begins with `$` and so cannot
    # match anything; `^` may have been intended. Left as is.
    text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", " ", text)
    text = re.sub(r"\d+", " ", text)

    # --- Arabic letter normalization ---
    text = re.sub(r'[إأٱآا]', 'ا', text)
    # The replacement argument is literal text, not a pattern, so it must
    # be the bare target letter (brackets would be inserted verbatim).
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'[ؤئ]', 'ء', text)

    # The substitutions above can leave space runs and edge spaces behind.
    text = re.sub(r' +', ' ', text)
    return text.strip()
|
|
|
|
|
def txt_preprocess(text):
    """Run the complete cleaning pipeline over ``text``.

    The steps run in a fixed order, each receiving the previous step's
    output: ``normalize_text``, ``stemming``, ``remove_stopwords`` (with its
    default stop-word file), ``remove_punctuations`` and ``Remove_unwanted``.

    Args:
        text: The raw text to preprocess.

    Returns:
        The text after all five steps have been applied.
    """
    pipeline = (
        normalize_text,
        stemming,
        remove_stopwords,
        remove_punctuations,
        Remove_unwanted,
    )
    for step in pipeline:
        text = step(text)
    return text
|
|