BhuiyanMasum committed
Commit ccfa333 · 1 Parent(s): e13f31a

Updated data.py

Files changed:
- data/dataset.txt +0 -0
- src/pipes/const.py +2 -2
- src/pipes/data.py +122 -58
- src/pipes/utils.py +0 -27
data/dataset.txt
CHANGED
The diff for this file is too large to render.
See raw diff
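Although too large to render here, the file's shape follows from the __main__ block in the updated data.py: utils.save_dict writes the dataset_dict as UTF-8 JSON, keyed by language, holding the shuffled train/val splits and the example count. A hypothetical excerpt (all values illustrative):

{"bn": {"train": ["...raw sentences..."], "val": ["..."], "count": 12345},
 "en": {"train": ["..."], "val": ["..."], "count": 12345},
 "gr": {"train": ["..."], "val": ["..."], "count": 12345}}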
src/pipes/const.py
CHANGED
@@ -1,3 +1,3 @@
 data_dir: str = "E:/bn_multi_tribe_mt/data/"
-
-
+langs: list[str] = ['bn', 'en', 'gr']
+MAX_SEQ_LEN = 30
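For context, a minimal sketch (not part of the commit) of how these constants are consumed by the updated pipeline in src/pipes/data.py:

import const   # same-directory import, as in src/pipes/data.py

for lang in const.langs:                                  # ['bn', 'en', 'gr']
    path = "{}/raw/{}.txt".format(const.data_dir, lang)   # per-language raw corpus read by SequenceLoader.pack()
    print(path)

print(const.MAX_SEQ_LEN)                                  # 30 — default padding length for SequenceProcessor.pad()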
src/pipes/data.py
CHANGED
@@ -1,78 +1,137 @@
(Most deleted lines from the old Sentence-based implementation are truncated in this rendering and are omitted below; context and added lines are shown.)
 import random
+import const
+import utils
+import string
 
+class SequenceLoader:
     def __init__(self):
+        self.sequence_dict = None
+        self.shuffled_sequences = None
         self.shuffled_indices = None
+        self.sequences = None
         self.max_seq_length = None
         self.vocab = None
+        self.lang = None
 
+    def pack(self):
+        self.sequences = utils.read_file("{}/raw/{}.txt".format(const.data_dir, self.lang))
 
+        examples_count = len(self.sequences)
+        split_index = int(examples_count * 0.80)
 
         if self.shuffled_indices is None:
+            self.shuffled_indices = list(range(examples_count))
             random.shuffle(self.shuffled_indices)
 
+        self.shuffled_sequences = [self.sequences[i] for i in self.shuffled_indices]
+
+        self.sequence_dict = dict(
+            train=self.shuffled_sequences[:split_index],
+            val=self.shuffled_sequences[split_index:],
+            count=examples_count,
         )
 
+    def get_dict(self):
+        return self.sequence_dict
+
+    def set_lang(self, lang):
+        self.lang = lang
+
+
+def remove_punctuation_from_seq(seq):
+    english_punctuations = string.punctuation
+    bangla_punctuations = "৷-–—’‘৳…।"
+    all_punctuations = english_punctuations + bangla_punctuations
+    cleaned_seq = ''.join([char for char in seq if char not in all_punctuations])
+    cleaned_seq = cleaned_seq.strip()
+    cleaned_seq = ' '.join(cleaned_seq.split())
+    return cleaned_seq
+
+
+def add_start_end_tags_seq(sequence):
+    return '<SOS> ' + sequence + ' <EOS>'
+
+
+def pad_sequence(sequence, max_seq_len, padding_token=0):
+    padded_sequence = sequence[:max_seq_len] + [padding_token] * (max_seq_len - len(sequence))
+    return padded_sequence
+
+
+class SequenceProcessor:
+    def __init__(self, _dataset_dict):
+        self.max_seq_len = 0
+        self.lang = None
+        self.dataset_dict = _dataset_dict
+        self.vocab = None
+
+    def remove_punctuation(self):
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            self.dataset_dict[self.lang]["train"][i] = remove_punctuation_from_seq(
+                self.dataset_dict[self.lang]["train"][i])
+
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            self.dataset_dict[self.lang]["val"][i] = remove_punctuation_from_seq(
+                self.dataset_dict[self.lang]["val"][i])
+
+    def build_vocab(self):
+        vocab = set()
+
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            seq = self.dataset_dict[self.lang]["train"][i]
+            vocab.update(seq.split())
+
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            seq = self.dataset_dict[self.lang]["val"][i]
+            vocab.update(seq.split())
+
+        self.vocab = sorted(list(vocab))
+        self.dataset_dict[self.lang]["vocab"] = self.vocab
+        self.dataset_dict[self.lang]["vocab_size"] = len(self.vocab)
+
+    def add_start_end_tags(self):
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            self.dataset_dict[self.lang]["train"][i] = add_start_end_tags_seq(
+                self.dataset_dict[self.lang]["train"][i])
+            self.max_seq_len = max(len(self.dataset_dict[self.lang]["train"][i].split()), self.max_seq_len)
+
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            self.dataset_dict[self.lang]["val"][i] = add_start_end_tags_seq(
+                self.dataset_dict[self.lang]["val"][i])
+            self.max_seq_len = max(len(self.dataset_dict[self.lang]["val"][i].split()), self.max_seq_len)
+
+        self.dataset_dict[self.lang]["max_seq_len"] = self.max_seq_len
+
     def tokenize(self):
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            seq = self.dataset_dict[self.lang]["train"][i]
             tokens = []
+            for word in seq.split():
                 tokens.append(self.vocab.index(word))
+            self.dataset_dict[self.lang]["train"][i] = tokens
 
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            seq = self.dataset_dict[self.lang]["val"][i]
             tokens = []
+            for word in seq.split():
                 tokens.append(self.vocab.index(word))
+            self.dataset_dict[self.lang]["val"][i] = tokens
 
+    def pad(self, max_seq_len=const.MAX_SEQ_LEN):
 
+        for i in range(len(self.dataset_dict[self.lang]["train"])):
+            self.dataset_dict[self.lang]["train"][i] = pad_sequence(
+                sequence=self.dataset_dict[self.lang]["train"][i], max_seq_len=max_seq_len)
 
+        for i in range(len(self.dataset_dict[self.lang]["val"])):
+            self.dataset_dict[self.lang]["val"][i] = pad_sequence(sequence=self.dataset_dict[self.lang]["val"][i],
+                                                                  max_seq_len=self.max_seq_len)
 
+    def set_lang(self, lang):
+        self.lang = lang
+        self.max_seq_len = 0
 
     def get_dict(self):
+        return self.dataset_dict
 
 
 class Dataset:
@@ -81,18 +140,22 @@ class Dataset:
         self.dataset_dict = {}
 
     def pack(self):
+        seq_loader = SequenceLoader()
         for lang in self.langs:
+            seq_loader.set_lang(lang)
+            seq_loader.pack()
+            self.dataset_dict[lang] = seq_loader.get_dict()
 
     def process(self):
+        seq_processor = SequenceProcessor(self.dataset_dict)
         for lang in self.langs:
+            seq_processor.set_lang(lang)
+            seq_processor.remove_punctuation()
+            seq_processor.add_start_end_tags()
+            seq_processor.build_vocab()
+            seq_processor.tokenize()
+            seq_processor.pad()
+        self.dataset_dict = seq_processor.get_dict()
 
     def get_dict(self):
         return self.dataset_dict
@@ -101,6 +164,7 @@ class Dataset:
 if __name__ == "__main__":
     dataset_object = Dataset(const.langs)
     dataset_object.pack()
-    dataset_object.process()
     dataset_dict = dataset_object.get_dict()
     utils.save_dict("{}/dataset.txt".format(const.data_dir), dataset_dict)
+    dataset_object.process()
+    print(utils.load_dict("{}/dataset.txt".format(const.data_dir)))
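For orientation, a minimal sketch (not part of the commit) of what the new per-sequence helpers in data.py do to a single line. The sample sentence and the four-word vocabulary are hypothetical; the helper bodies are condensed from the diff above.

import string

def remove_punctuation_from_seq(seq):                      # strips English and Bangla punctuation, collapses whitespace
    punct = string.punctuation + "৷-–—’‘৳…।"
    return ' '.join(''.join(c for c in seq if c not in punct).strip().split())

def add_start_end_tags_seq(sequence):                       # wraps a sequence in <SOS>/<EOS> markers
    return '<SOS> ' + sequence + ' <EOS>'

def pad_sequence(sequence, max_seq_len, padding_token=0):   # truncates or zero-pads token ids to a fixed length
    return sequence[:max_seq_len] + [padding_token] * (max_seq_len - len(sequence))

seq = add_start_end_tags_seq(remove_punctuation_from_seq("Hello, world!"))
print(seq)                                                  # <SOS> Hello world <EOS>
vocab = sorted({'<SOS>', '<EOS>', 'Hello', 'world'})        # stand-in for SequenceProcessor.build_vocab()
tokens = [vocab.index(w) for w in seq.split()]              # [1, 2, 3, 0]
print(pad_sequence(tokens, max_seq_len=8))                  # [1, 2, 3, 0, 0, 0, 0, 0]

After Dataset.pack() and Dataset.process(), each language entry in dataset_dict carries train, val, count, vocab, vocab_size and max_seq_len, with train/val holding padded token-id lists like the one above.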
src/pipes/utils.py
CHANGED
@@ -2,15 +2,6 @@ import json
 import string
 
 
-def pad_sequence(sequence, max_length, padding_token=0):
-    padded_sequence = sequence[:max_length] + [padding_token] * (max_length - len(sequence))
-    return padded_sequence
-
-
-def add_start_end_tags(sentence):
-    return '<START> ' + sentence + ' <END>'
-
-
 def save_dict(file_path, data_dict, encoding='utf-8'):
     with open(file_path, "w", encoding=encoding) as f:
         json.dump(data_dict, f, ensure_ascii=False)
@@ -26,21 +17,3 @@ def read_file(file_path):
     with open(file_path, "r", encoding="utf-8") as f:
         sentences = f.readlines()
     return sentences
-
-
-def build_vocab(sentences):
-    vocab = set()
-    for sentence in sentences:
-        vocab.update(sentence.split())
-    return sorted(list(vocab))
-
-
-def remove_punctuation(sentence):
-    english_punctuations = string.punctuation
-    bangla_punctuations = "৷-–—’‘৳…।"
-    all_punctuations = english_punctuations + bangla_punctuations
-    cleaned_sentence = ''.join([char for char in sentence if char not in all_punctuations])
-    cleaned_sentence = cleaned_sentence.strip()
-    cleaned_sentence = ' '.join(cleaned_sentence.split())
-    return cleaned_sentence
-
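As a quick orientation, a minimal round-trip sketch (not part of the commit) for the helpers that remain in utils.py; the file name and sample dict are hypothetical, and it assumes the script is run from src/pipes/ as data.py is.

import json
import utils   # same-directory import, as in src/pipes/data.py

sample = {"bn": {"train": [[1, 2, 3]], "val": [], "count": 1}}   # hypothetical dataset_dict slice
utils.save_dict("dataset_sample.txt", sample)                    # writes UTF-8 JSON with ensure_ascii=False

lines = utils.read_file("dataset_sample.txt")                    # readlines() of the written file
print(json.loads(''.join(lines)) == sample)                      # True — the dict survives the round trip

utils.load_dict, which the updated data.py prints at the end, is presumably the matching JSON reader; it is not shown in this diff.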