JLTastet committed on
Commit
e89883a
1 Parent(s): 3f451df

Add script used to clean the dataset

Browse files
Files changed (1) hide show
  1. mrclean.py +95 -0
mrclean.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Alternative marker values, currently disabled:
# START_TOKEN = '<s>'
# END_TOKEN = '</s>'
# PADDING_TOKEN = '<pad>'

# Markers used by the cleanup_* functions in this module: START_TOKEN is
# prepended to a document, END_TOKEN and repeated PADDING_TOKENs are appended.
# NOTE(review): all three are empty strings here, so no marker is inserted and
# the r'\n\n' boundaries rewritten by some cleanup functions are replaced by
# an empty string -- confirm this is intended and that the values were not
# lost when this file was exported.
START_TOKEN = ''
END_TOKEN = ''
PADDING_TOKEN = ''
10
+
11
def _make_padding_sequence(seq_length):
    """Return END_TOKEN followed by ``seq_length`` copies of PADDING_TOKEN.

    The pieces are concatenated without any separator.
    """
    return END_TOKEN + PADDING_TOKEN * seq_length
13
+
14
def cleanup_simple_wikipedia(text, seq_length):
    """Mark every blank-line-separated chunk of *text* as its own document.

    Each pair of consecutive newlines is replaced by the padding sequence
    (END_TOKEN plus ``seq_length`` PADDING_TOKENs) followed by START_TOKEN.
    The result is prefixed with START_TOKEN and suffixed with the padding
    sequence.
    """
    terminator = _make_padding_sequence(seq_length)
    # The boundary is passed to ``re.sub`` as a replacement template, so a
    # backslash inside the tokens would be interpreted as an escape.
    boundary = terminator + START_TOKEN
    body = re.sub(r'\n\n', boundary, text)
    return ''.join((START_TOKEN, body, terminator))
18
+
19
def cleanup_wikipedia(text, seq_length):
    """Unwrap ``= = = ... = = =`` headings, strip lines and mark documents.

    Steps, in order:

    1. ``= = = Title = = =`` followed by a newline is replaced by the bare
       title; the newline after the heading is consumed as well.
    2. Every line is stripped of leading/trailing whitespace.
    3. The first character of the re-joined text is dropped.
    4. Each pair of consecutive newlines becomes the padding sequence
       followed by START_TOKEN, and the whole text is wrapped in
       START_TOKEN ... padding sequence.
    """
    terminator = _make_padding_sequence(seq_length)
    unwrapped = re.sub(r'= = = (.+?) = = =\n', r'\1', text)
    rejoined = '\n'.join(map(str.strip, unwrapped.splitlines()))
    # NOTE(review): the slice removes the first character unconditionally;
    # presumably the input always starts with a newline -- confirm.
    trimmed = rejoined[1:]
    body = re.sub(r'\n\n', terminator + START_TOKEN, trimmed)
    return ''.join((START_TOKEN, body, terminator))
25
+
26
def cleanup_qed(text, seq_length):
    """Convert mostly-uppercase lines of *text* to sentence case.

    For every line, parenthesised spans and the characters ``.``, ``!`` and
    ``?`` are ignored when measuring the fraction of uppercase characters.
    When at least half of the remaining characters are uppercase, every
    lowercase ``l`` is replaced by ``I``, the line is split into sentences on
    ``.``/``!``/``?`` and each sentence is passed through ``str.capitalize``.
    All lines are stripped and re-joined with newlines.

    The result is wrapped as a single document: START_TOKEN, the text, then
    END_TOKEN and ``seq_length`` PADDING_TOKENs.

    Lines that are empty, or that consist only of parenthesised spans and
    sentence punctuation (e.g. ``(Applause)`` or ``...``), have no characters
    to measure and are passed through unchanged instead of raising
    ``ZeroDivisionError``.
    """
    # TODO: this should probably be padded too, but it’s difficult to detect when subtitles start and end
    # The handling of proper nouns and of parentheses isn’t perfect, but this is still an improvement over the base text
    punctuation_ex = re.compile(r'([.!?]\s*)')
    unimportant_chars_ex = re.compile(r'\(.*?\)|[.!?]')
    lines = []
    for line in text.splitlines():
        line_body = unimportant_chars_ex.sub('', line)
        # An empty body means there is nothing to measure; skipping it also
        # avoids dividing by zero below.
        if line_body:
            f_upper = sum(c.isupper() for c in line_body) / len(line_body)
            if f_upper >= 0.5:  # Mostly uppercase characters
                # Taken from https://stackoverflow.com/a/41662260
                # The capture group keeps the separators in the split result;
                # capitalize() leaves them unchanged, so joining restores the
                # original punctuation and spacing.
                split_on_punctuation = punctuation_ex.split(line.replace('l', 'I'))
                line = ''.join(sentence.capitalize() for sentence in split_on_punctuation)
        lines.append(line.strip())
    return START_TOKEN + '\n'.join(lines) + _make_padding_sequence(seq_length)
43
+
44
def cleanup_extra_spaces(text):
    """Normalise horizontal whitespace in *text*.

    Runs of spaces, tabs and no-break spaces (U+00A0) are collapsed into a
    single space, then a single such whitespace character directly before one
    of ``. , ; ! ?`` is removed. Newlines are left untouched.
    """
    collapsed = re.sub(r'[ \t\u00A0]+', ' ', text)
    return re.sub(r'[ \t\u00A0]([.,;!?])', r'\1', collapsed)
50
+
51
def cleanup_bnc_spoken(text, seq_length):
    """Normalise spacing, then mark blank-line-separated chunks as documents.

    *text* is first passed through ``cleanup_extra_spaces``. Each pair of
    consecutive newlines is then replaced by the padding sequence followed by
    START_TOKEN, and the result is wrapped in START_TOKEN ... padding
    sequence.
    """
    terminator = _make_padding_sequence(seq_length)
    tidy = cleanup_extra_spaces(text)
    body = re.sub(r'\n\n', terminator + START_TOKEN, tidy)
    return ''.join((START_TOKEN, body, terminator))
56
+
57
def cleanup_aochildes(text, seq_length):
    """Normalise spacing and wrap *text* as one document.

    Returns START_TOKEN, the text after ``cleanup_extra_spaces``, then the
    padding sequence for ``seq_length``.
    """
    tidy = cleanup_extra_spaces(text)
    terminator = _make_padding_sequence(seq_length)
    return ''.join((START_TOKEN, tidy, terminator))
60
+
61
def cleanup_cbt(text, seq_length):
    """Normalise spacing, re-attach detached apostrophes and wrap *text*.

    After ``cleanup_extra_spaces``, a single space, tab or no-break space
    between a word character and an apostrophe (``'`` or ``’``) that is
    itself followed by a word character is removed. The result is wrapped as
    one document: START_TOKEN, text, padding sequence.
    """
    tidy = cleanup_extra_spaces(text)
    # Disabled quote-spacing rules, kept for reference:
    #space_before_quote = re.compile(r"[ \t\u00A0](['’])")
    #space_after_quote = re.compile(r"([`])[ \t\u00A0]")
    #text = space_before_quote.sub(r'\1', text)
    #text = space_after_quote.sub(r'\1', text)
    space_before_apostrophe = r"([\w\d])[ \t\u00A0](['’]\w)"
    rejoined = re.sub(space_before_apostrophe, r'\1\2', tidy)
    return ''.join((START_TOKEN, rejoined, _make_padding_sequence(seq_length)))
70
+
71
def cleanup_children_stories(text, seq_length):
    """Wrap *text*, unmodified, as a single document.

    Returns START_TOKEN, the text, then the padding sequence for
    ``seq_length``.
    """
    # A skipped line sometimes starts a new story and sometimes occurs
    # inside one, so it is not a reliable boundary between independent
    # stories; no splitting is attempted.
    terminator = _make_padding_sequence(seq_length)
    return ''.join((START_TOKEN, text, terminator))
76
+
77
def cleanup_gutenberg(text, seq_length):
    """Return *text* followed by ``seq_length`` PADDING_TOKENs.

    Neither START_TOKEN nor END_TOKEN is added.
    """
    # The text is clean overall. Some entries (e.g. figure captions preceded
    # by a number) do not look very useful, but removing them would also
    # remove bullet lists that fit the surrounding text, so nothing is
    # filtered.
    # No start or end tokens because the text seems to be cut.
    padding = PADDING_TOKEN * seq_length
    return text + padding
85
+
86
def cleanup_open_subtitles(text, seq_length):
    """Drop lines mentioning "subtitle" and wrap *text* as one document.

    Every line containing the word "subtitle" (case-insensitive) is removed
    together with its trailing newline, e.g. credits such as
    "Subtitles by ...". The trailing newline is optional, so a credit on the
    final line of a text that does not end with a newline is removed too.

    Returns START_TOKEN, the filtered text, then the padding sequence for
    ``seq_length``.
    """
    # The text is mostly clean, apart from some subtitle credits
    # such as "Subtitles by ...".
    # With MULTILINE, ``$`` also matches at the very end of the string, where
    # no newline follows; ``\n?`` lets that last line match as well.
    subtitle_credit_ex = re.compile(r'^.*subtitle.*$\n?', re.MULTILINE | re.IGNORECASE)
    text = subtitle_credit_ex.sub('', text)
    return START_TOKEN + text + _make_padding_sequence(seq_length)
92
+
93
def cleanup_switchboard(text, seq_length):
    """Return *text* followed by ``seq_length`` PADDING_TOKENs.

    Neither START_TOKEN nor END_TOKEN is added.
    """
    # No start or end tokens because the text seems to be cut.
    padding = PADDING_TOKEN * seq_length
    return text + padding