zArabi committed on
Commit
3238f2f
1 Parent(s): 1a8dfbe

Update preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +73 -0
preprocessing.py CHANGED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hazm
2
+ from cleantext import clean
3
+ import regex as re
4
+
5
def cleanhtml(raw_html):
    """Remove HTML-style tags from a string.

    Every span that starts at ``<`` and ends at the next ``>`` is deleted;
    the text between tags is left untouched.

    Args:
        raw_html: Text that may contain markup.

    Returns:
        ``raw_html`` with all ``<...>`` spans removed.
    """
    # ``[^>]`` also matches newlines, so a tag that is broken across several
    # lines is removed as well (``.`` stops at a line break and would leave
    # such a tag in place).
    return re.sub(r"<[^>]*>", "", raw_html)
9
+
10
# Characters stripped by ``cleaning``: emoji / pictograph ranges, the
# zero-width joiner, the emoji variation selector and the directional isolate
# marks. The pattern is constant, so it is compiled once at import time.
# U+200C (zero-width non-joiner) is not part of the class, so it is kept.
# NOTE(review): the range U+24C2-U+1F251 is very broad and also spans many
# non-emoji characters (e.g. CJK); it is left unchanged to preserve behaviour.
_WEIRD_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u200d"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"
    "\u2069"
    "\u2066"
    "\u2068"
    "\u2067"
    "]+",
    flags=re.UNICODE,
)

# Shared hazm.Normalizer instance; built on first use and then reused.
_NORMALIZER = None


def _get_normalizer():
    """Return the shared ``hazm.Normalizer``, creating it on the first call."""
    global _NORMALIZER
    if _NORMALIZER is None:
        _NORMALIZER = hazm.Normalizer()
    return _NORMALIZER


def cleaning(text):
    """Clean and normalize a piece of raw text.

    The steps run in this order:

    1. Strip leading/trailing whitespace.
    2. Run clean-text's ``clean`` with lowercasing and line-break removal
       enabled; URLs, e-mail addresses, phone numbers and currency symbols
       are replaced with empty strings, while numbers, digits and
       punctuation are kept.
    3. Remove HTML tags with ``cleanhtml``.
    4. Normalize the text with ``hazm.Normalizer``.
    5. Remove every character matched by ``_WEIRD_PATTERN``.
    6. Remove ``#`` characters and collapse whitespace runs to one space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # Regular cleaning.
    # https://pypi.org/project/clean-text/ >> works well for eng and de languages
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,  # keep the punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # Strip HTML tags.
    text = cleanhtml(text)

    # Normalizing > https://github.com/sobhe/hazm
    text = _get_normalizer().normalize(text)

    # Remove weird patterns (emoji, joiners, directional marks).
    text = _WEIRD_PATTERN.sub("", text)

    # Remove hashtags' "#" signs, then collapse extra whitespace. The raw
    # string avoids the invalid "\s" escape sequence of a plain literal.
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)

    return text