Iulian277 committed on
Commit
251b871
·
1 Parent(s): d99c0ad

Create normalize.py

Browse files
Files changed (1) hide show
  1. normalize.py +44 -0
normalize.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import emoji
3
+
4
def normalize(sentence: str) -> str:
    """
    Prepare raw text for tokenization; call this before the tokenizer.

    The steps run in this order:
        1. Cedilla letters are swapped for their comma-below forms
           (ş -> ș, Ş -> Ș, ţ -> ț, Ţ -> Ț).
        2. @mentions become USER, #hashtags become HASHTAG, and runs that
           start with http or www become HTTPURL.
        3. Emoji are spelled out through `emoji.demojize`, using ' :' and
           ': ' as the opening and closing delimiters.
        4. Every whitespace-free run that starts and ends with a colon has
           its underscores, colons and hyphens removed
           (:emoji_with_long_name: -> emojiwithlongname).
        5. Each run of whitespace is collapsed into a single space.

    Args:
        `sentence` (str): The input string. Any other value is converted
            with `str()` first.
    Returns:
        str: The normalized string.
    """

    # Coerce whatever was passed in into a string
    text = str(sentence)

    # Cedilla variants -> comma-below variants
    for cedilla, comma_below in (('ş', 'ș'), ('Ş', 'Ș'), ('ţ', 'ț'), ('Ţ', 'Ț')):
        text = text.replace(cedilla, comma_below)

    # Placeholder substitutions, applied in this exact order
    placeholder_rules = (
        (r'@\S+', 'USER'),
        (r'#\S+', 'HASHTAG'),
        (r'http\S+', 'HTTPURL'),
        (r'www\S+', 'HTTPURL'),
    )
    for pattern, placeholder in placeholder_rules:
        text = re.sub(pattern, placeholder, text)

    # Spell emoji out as ' :emoji_with_long_name: '
    text = emoji.demojize(text, delimiters=(' :', ': '))

    # Drop '_', ':' and '-' from every colon-delimited run.
    # NOTE: the pattern is not emoji-specific; any ':...:' run without
    # whitespace in the input is flattened the same way.
    strip_marks = str.maketrans('', '', '_:-')

    def _flatten(match):
        return match.group(0).translate(strip_marks)

    text = re.sub(r':\S+:', _flatten, text)

    # Squeeze whitespace runs down to one space
    return re.sub(r'\s+', ' ', text)