czl committed on
Commit
5330bda
·
verified ·
1 Parent(s): 1389b5a

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,36 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pt
README.md CHANGED
@@ -1,12 +1,14 @@
1
- ---
2
- title: Seq2Seq
3
- emoji: 📈
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.12.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
+ ---
2
+ title: Seq2Seq
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.12.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ python_version: "3.10"
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import re
4
+ import unicodedata
5
+ from typing import Tuple
6
+
7
+ import gradio as gr
8
+ import spacy
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ nlp = spacy.load('en_core_web_sm')
14
+
15
def greet(name):
    """
    Build a greeting for a name

    :param name: name to greet (must be a string)
    :return: the text "Hello <name>!!"
    """
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
17
+
18
# Read the word2idx and idx2word mappings of the 188M models from JSON files.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform's default encoding.
with open('vocab/word2idx.json', 'r', encoding='utf-8') as f:
    word2idx = json.load(f)
with open('vocab/idx2word.json', 'r', encoding='utf-8') as f:
    idx2word = json.load(f)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
+
27
def unicodetoascii(text):
    """
    Strip combining marks from a string after NFKD normalisation

    :param text: value to be converted; it is passed through ``str()`` first
    :return: the decomposed text without characters of Unicode category ``Mn``
    """
    decomposed = unicodedata.normalize('NFKD', str(text))
    kept = []
    for char in decomposed:
        # 'Mn' (non-spacing mark) covers the accents split off by NFKD.
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)
37
+
38
def preprocess_text(text, fn=unicodetoascii):
    """
    Clean raw text before it is tokenized

    :param text: text to be cleaned
    :param fn: callable applied to the text first (defaults to unicodetoascii)
    :return: lower-cased text with URLs and non-ASCII characters removed,
             "!"/"?" runs between word characters dropped and whitespace
             collapsed and trimmed
    """
    cleaned = fn(text).lower()
    # Applied in order; each pair is (pattern, replacement).
    substitutions = (
        (r'http\S+', ''),                # Remove URLs
        (r'[^\x00-\x7F]+', ""),          # Remove non-ASCII characters
        (r"(\w)[!?]+(\w)", r'\1\2'),     # Remove !? between words
        (r"\s\s+", r" "),                # Remove extra spaces
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
47
+
48
def tokenize(text, nlp=nlp):
    """
    Split text into token strings
    :param text: text to be tokenized
    :param nlp: pipeline object whose ``tokenizer`` is applied to the text
    :return: list with the ``text`` of every token, in order
    """
    token_texts = []
    for tok in nlp.tokenizer(text):
        token_texts.append(tok.text)
    return token_texts
55
+
56
def lookup_words(idx2word, indices):
    """
    Translate indices back into words
    :param idx2word: index to word mapping, keyed by the index as a string
    :param indices: iterable of indices to be converted
    :return: list of words, one per index and in the same order
    """
    words = []
    for idx in indices:
        # The mapping's keys are strings, hence the str() conversion.
        words.append(idx2word[str(idx)])
    return words
64
+
65
+
66
class Encoder(nn.Module):
    """
    Bidirectional single-layer GRU encoder

    Embeds the source token ids, runs them through the GRU and projects the
    last two hidden-state slices (concatenated) into a vector of size
    ``dec_hid_dim`` that serves as the decoder's initial hidden state.
    """
    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float = 0):
        super().__init__()

        # sizes: input vocabulary, embedding, encoder hidden, decoder hidden
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # trainable embedding for every input index
        self.embedding = nn.Embedding(input_dim, emb_dim)

        # bidirectional GRU, so its outputs have 2 * enc_hid_dim features
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, num_layers=1,
                          batch_first=False, bidirectional=True)
        # maps the two concatenated final states to the decoder hidden size
        self.fc = nn.Linear(2 * enc_hid_dim, dec_hid_dim)
        # dropout applied to the embeddings
        self.dropout = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode a batch of source sequences

        :param src: index tensor laid out sequence-first (batch_first=False)
        :return: tuple of (GRU outputs for every position,
                 tanh-projected hidden state of size dec_hid_dim)
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        outputs, final_states = self.rnn(embedded)
        # the last two slices hold the final state of each GRU direction
        merged = torch.cat((final_states[-2], final_states[-1]), dim=1)
        hidden = torch.tanh(self.fc(merged))
        return outputs, hidden
103
+
104
+
105
class Attention(nn.Module):
    """
    Attention weights over the encoder outputs

    The decoder hidden state is concatenated with every encoder output,
    passed through a linear layer and tanh, summed over the attention
    dimension and normalised with a softmax across source positions.
    (The original author labelled this "Luong attention".)
    """
    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        # sizes of the encoder and decoder hidden layers
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        # input width of the scoring layer: decoder state + bidirectional
        # encoder output; AttnDecoder reads this attribute as well
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> torch.Tensor:
        """
        Compute attention weights for one decoding step

        :param decoder_hidden: current decoder hidden state, one row per batch item
        :param encoder_outputs: encoder outputs, sequence-first
        :return: weights of shape (batch, src_len); each row sums to 1
        """
        src_len = encoder_outputs.shape[0]
        # copy the decoder state once per source position
        hidden_per_position = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        # switch the encoder outputs to batch-first to line up with the copies
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)
        combined = torch.cat((hidden_per_position, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn(combined))
        scores = energy.sum(dim=2)

        return F.softmax(scores, dim=1)
135
+
136
+
137
class AttnDecoder(nn.Module):
    """
    GRU RNN Decoder with attention

    Each step embeds the previous token, builds an attention-weighted summary
    of the encoder outputs, feeds both to a single-layer GRU and projects the
    GRU output, the summary and the embedding to scores over the vocabulary.
    """
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attention: nn.Module,
                 dropout: float = 0):
        super(AttnDecoder, self).__init__()

        # dimension of output layer (vocabulary size)
        self.output_dim = output_dim
        # dimension of embedding layer
        self.emb_dim = emb_dim
        # dimension of encoding hidden layer
        self.enc_hid_dim = enc_hid_dim
        # dimension of decoding hidden layer
        self.dec_hid_dim = dec_hid_dim
        # attention layer; must expose ``attn_in`` and return (batch, src_len) weights
        self.attention = attention

        # create embedding layer used to train embedding representations of the corpus
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # use GRU for RNN; its input is the embedding plus the weighted encoder summary
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
        # dropout layer applied to the embeddings (the raw rate is not stored
        # separately: it was previously assigned to the same attribute and
        # immediately overwritten by this module)
        self.dropout = nn.Dropout(dropout)

    def encode_attention(self,
                         decoder_hidden: torch.Tensor,
                         encoder_outputs: torch.Tensor) -> torch.Tensor:
        """
        Build the attention-weighted summary of the encoder outputs

        :param decoder_hidden: current decoder hidden state
        :param encoder_outputs: encoder outputs, sequence-first
        :return: weighted encoder representation of shape (1, batch, features)
        """
        a = self.attention(decoder_hidden, encoder_outputs)
        # (batch, src_len) -> (batch, 1, src_len) so bmm yields one row per batch item
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted_encoder_rep = torch.bmm(a, encoder_outputs)
        # back to sequence-first so it can be concatenated with the embedding
        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)
        return weighted_encoder_rep

    def forward(self,
                input: torch.Tensor,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run one decoding step

        :param input: indices of the previous tokens, one per batch item
        :param decoder_hidden: current decoder hidden state
        :param encoder_outputs: encoder outputs, sequence-first
        :return: tuple of (scores over the vocabulary, new decoder hidden state)
        """
        # add the sequence dimension expected by the GRU (length 1)
        input = input.unsqueeze(0)
        # apply dropout to embedding layer
        embedded = self.dropout(self.embedding(input))
        weighted_encoder = self.encode_attention(decoder_hidden, encoder_outputs)

        # generate an output and hidden layer from the rnn
        rnn_input = torch.cat((embedded, weighted_encoder), dim=2)
        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        # drop the length-1 sequence dimension again before the projection
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder = weighted_encoder.squeeze(0)
        output = self.out(torch.cat((output, weighted_encoder, embedded), dim=1))
        return output, decoder_hidden.squeeze(0)
200
+
201
class Decoder(nn.Module):
    """
    GRU RNN Decoder without attention

    Uses the encoder output at the last source position as a fixed context
    for every decoding step.
    """
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float = 0):
        super(Decoder, self).__init__()

        # dimension of output layer (vocabulary size)
        self.output_dim = output_dim
        # dimension of embedding layer
        self.emb_dim = emb_dim
        # dimension of encoding hidden layer
        self.enc_hid_dim = enc_hid_dim
        # dimension of decoding hidden layer
        self.dec_hid_dim = dec_hid_dim

        # create embedding layer used to train embedding representations of the corpus
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # GRU RNN; its input is the embedding plus the encoder context
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        # dropout layer applied to the embeddings (the raw rate is not stored
        # separately: it was previously assigned to the same attribute and
        # immediately overwritten by this module)
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                input: torch.Tensor,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run one decoding step

        :param input: indices of the previous tokens, one per batch item
        :param decoder_hidden: current decoder hidden state
        :param encoder_outputs: encoder outputs, sequence-first
        :return: tuple of (scores over the vocabulary, new decoder hidden state)
        """
        # add the sequence dimension expected by the GRU (length 1)
        input = input.unsqueeze(0)
        # apply dropout to embedding layer
        embedded = self.dropout(self.embedding(input))
        # context is the encoder output at the last source position
        context = encoder_outputs[-1, :, :]
        context = context.repeat(embedded.shape[0], 1, 1)
        embs_and_context = torch.cat((embedded, context), -1)
        # generate an output and hidden layer from the rnn
        output, decoder_hidden = self.rnn(embs_and_context, decoder_hidden.unsqueeze(0))
        # drop the length-1 sequence dimension again before the projection
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        context = context.squeeze(0)
        output = self.out(torch.cat((output, embedded, context), -1))
        return output, decoder_hidden.squeeze(0)
250
+
251
class Seq2Seq(nn.Module):
    """
    Sequence-to-sequence wrapper around an encoder and a step-wise decoder
    """
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        # device on which the output buffer is allocated
        self.device = device

    def forward(self,
                src: torch.Tensor,
                trg: torch.Tensor,
                teacher_forcing_ratio: float = 0.5) -> torch.Tensor:
        """
        Decode the target sequence step by step

        :param src: source indices, batch-first (transposed internally)
        :param trg: target indices, batch-first (transposed internally)
        :param teacher_forcing_ratio: probability of feeding the ground-truth
                                      token instead of the model's prediction
        :return: decoder scores of shape (max_len, batch_size, vocab_size);
                 position 0 is left as zeros
        """
        src = src.transpose(0, 1)  # (max_len, batch_size)
        trg = trg.transpose(0, 1)  # (max_len, batch_size)
        max_len = trg.shape[0]
        batch_size = src.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)

        # decoding starts from the first target token (<sos>)
        decoder_input = trg[0, :]

        for step in range(1, max_len):
            logits, hidden = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[step] = logits
            use_teacher = random.random() < teacher_forcing_ratio
            predicted = logits.max(1)[1]
            if use_teacher:
                decoder_input = trg[step]
            else:
                decoder_input = predicted

        return outputs
289
+
290
# ---------------------------------------------------------------------------
# 188M models (vocabulary from vocab/)
# ---------------------------------------------------------------------------
# Hyper-parameters describing the architecture whose weights are loaded below.
params = {'input_dim': len(word2idx),
          'emb_dim': 128,
          'enc_hid_dim': 256,
          'dec_hid_dim': 256,
          'dropout': 0.5,
          'attn_dim': 32,
          'teacher_forcing_ratio': 0.5,
          'epochs': 35}

# Every checkpoint is mapped to the CPU while loading and the finished model
# is moved to `device` afterwards.
enc = Encoder(input_dim=params['input_dim'], emb_dim=params['emb_dim'],
              enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'],
              dropout=params['dropout'])
attn = Attention(enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'],
                 attn_dim=params['attn_dim'])
dec = AttnDecoder(output_dim=params['input_dim'], emb_dim=params['emb_dim'],
                  enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'],
                  attention=attn, dropout=params['dropout'])
attn_model = Seq2Seq(encoder=enc, decoder=dec, device=device)
attn_model.load_state_dict(torch.load('AttnSeq2Seq-188M_epoch35.pt',
                                      map_location=torch.device('cpu')))
attn_model.to(device)

enc = Encoder(input_dim=params['input_dim'], emb_dim=params['emb_dim'],
              enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'],
              dropout=params['dropout'])
dec = Decoder(output_dim=params['input_dim'], emb_dim=params['emb_dim'],
              enc_hid_dim=params['enc_hid_dim'], dec_hid_dim=params['dec_hid_dim'],
              dropout=params['dropout'])
norm_model = Seq2Seq(encoder=enc, decoder=dec, device=device)
norm_model.load_state_dict(torch.load('NormSeq2Seq-188M_epoch35.pt',
                                      map_location=torch.device('cpu')))
norm_model.to(device)

# ---------------------------------------------------------------------------
# 219M models (vocabulary from vocab219/)
# ---------------------------------------------------------------------------
# JSON text is UTF-8 by specification, so the encoding is stated explicitly.
with open('vocab219/word2idx.json', 'r', encoding='utf-8') as f:
    word2idx2 = json.load(f)
with open('vocab219/idx2word.json', 'r', encoding='utf-8') as f:
    idx2word2 = json.load(f)

params219 = {'input_dim': len(word2idx2),
             'emb_dim': 192,
             'enc_hid_dim': 256,
             'dec_hid_dim': 256,
             'dropout': 0.5,
             'attn_dim': 64,
             'teacher_forcing_ratio': 0.5,
             'epochs': 35}

enc = Encoder(input_dim=params219['input_dim'], emb_dim=params219['emb_dim'],
              enc_hid_dim=params219['enc_hid_dim'], dec_hid_dim=params219['dec_hid_dim'],
              dropout=params219['dropout'])
attn = Attention(enc_hid_dim=params219['enc_hid_dim'], dec_hid_dim=params219['dec_hid_dim'],
                 attn_dim=params219['attn_dim'])
dec = AttnDecoder(output_dim=params219['input_dim'], emb_dim=params219['emb_dim'],
                  enc_hid_dim=params219['enc_hid_dim'], dec_hid_dim=params219['dec_hid_dim'],
                  attention=attn, dropout=params219['dropout'])
attn_model219 = Seq2Seq(encoder=enc, decoder=dec, device=device)
attn_model219.load_state_dict(torch.load('AttnSeq2Seq-219M_epoch35.pt',
                                         map_location=torch.device('cpu')))
attn_model219.to(device)

enc = Encoder(input_dim=params219['input_dim'], emb_dim=params219['emb_dim'],
              enc_hid_dim=params219['enc_hid_dim'],
              dec_hid_dim=params219['dec_hid_dim'], dropout=params219['dropout'])
dec = Decoder(output_dim=params219['input_dim'], emb_dim=params219['emb_dim'],
              enc_hid_dim=params219['enc_hid_dim'],
              dec_hid_dim=params219['dec_hid_dim'],
              dropout=params219['dropout'])
norm_model219 = Seq2Seq(encoder=enc, decoder=dec, device=device)
norm_model219.load_state_dict(torch.load('NormSeq2Seq-219M_epoch35.pt',
                                         map_location=torch.device('cpu')))
norm_model219.to(device)

# ---------------------------------------------------------------------------
# 219M-SW models (vocabulary from vocab219SW/)
# ---------------------------------------------------------------------------
with open('vocab219SW/word2idx.json', 'r', encoding='utf-8') as f:
    word2idx3 = json.load(f)
with open('vocab219SW/idx2word.json', 'r', encoding='utf-8') as f:
    idx2word3 = json.load(f)

params219SW = {'input_dim': len(word2idx3),
               'emb_dim': 192,
               'enc_hid_dim': 256,
               'dec_hid_dim': 256,
               'dropout': 0.5,
               'attn_dim': 64,
               'teacher_forcing_ratio': 0.5,
               'epochs': 35}

enc = Encoder(input_dim=params219SW['input_dim'], emb_dim=params219SW['emb_dim'],
              enc_hid_dim=params219SW['enc_hid_dim'], dec_hid_dim=params219SW['dec_hid_dim'],
              dropout=params219SW['dropout'])
attn = Attention(enc_hid_dim=params219SW['enc_hid_dim'], dec_hid_dim=params219SW['dec_hid_dim'],
                 attn_dim=params219SW['attn_dim'])
# emb_dim is read from params219SW like every other setting of this model
# (it used to be taken from params219, which only worked because both
# dictionaries hold the same value).
dec = AttnDecoder(output_dim=params219SW['input_dim'], emb_dim=params219SW['emb_dim'],
                  enc_hid_dim=params219SW['enc_hid_dim'], dec_hid_dim=params219SW['dec_hid_dim'],
                  attention=attn, dropout=params219SW['dropout'])
attn_model219SW = Seq2Seq(encoder=enc, decoder=dec, device=device)
attn_model219SW.load_state_dict(torch.load('AttnSeq2Seq-219M-SW_epoch35.pt',
                                           map_location=torch.device('cpu')))
attn_model219SW.to(device)

enc = Encoder(input_dim=params219SW['input_dim'], emb_dim=params219SW['emb_dim'],
              enc_hid_dim=params219SW['enc_hid_dim'],
              dec_hid_dim=params219SW['dec_hid_dim'], dropout=params219SW['dropout'])
dec = Decoder(output_dim=params219SW['input_dim'], emb_dim=params219SW['emb_dim'],
              enc_hid_dim=params219SW['enc_hid_dim'],
              dec_hid_dim=params219SW['dec_hid_dim'],
              dropout=params219SW['dropout'])
norm_model219SW = Seq2Seq(encoder=enc, decoder=dec, device=device)
norm_model219SW.load_state_dict(torch.load('NormSeq2Seq-219M-SW_epoch35.pt',
                                           map_location=torch.device('cpu')))
norm_model219SW.to(device)

# The spaCy pipeline `nlp` loaded at the top of the file is reused here;
# loading 'en_core_web_sm' a second time only repeated the same work.

# Loaded models keyed by the names used in the UI and the generate functions.
models_dict = {'AttentionSeq2Seq-188M': attn_model, 'NormalSeq2Seq-188M': norm_model,
               'AttentionSeq2Seq-219M': attn_model219,
               'NormalSeq2Seq-219M': norm_model219,
               'AttentionSeq2Seq-219M-SW': attn_model219SW,
               'NormalSeq2Seq-219M-SW': norm_model219SW}
397
+
398
def generateAttn188(sentence, history, max_len=12,
                    word2idx=word2idx, idx2word=idx2word,
                    device=device, tokenize=tokenize, preprocess_text=preprocess_text,
                    lookup_words=lookup_words, models_dict=models_dict):
    """
    Generate a response with the 'AttentionSeq2Seq-188M' model (greedy decoding)
    :param sentence: sentence to respond to
    :param history: previous chat turns; accepted but not used
    :param max_len: maximum number of decoding steps
    :param word2idx: word to index mapping
    :param idx2word: index to word mapping
    :param device: device the input tensors are moved to
    :param tokenize: callable splitting the cleaned sentence into tokens
    :param preprocess_text: callable cleaning the raw sentence
    :param lookup_words: callable mapping indices back to words
    :param models_dict: mapping from model name to loaded model
    :return: response text with the <bos>/<eos> markers removed
    """
    model = models_dict['AttentionSeq2Seq-188M']
    model.eval()

    # Encode the sentence as <bos> ... <eos>; unknown tokens map to <unk>.
    source_ids = []
    for token in tokenize(preprocess_text(sentence)):
        source_ids.append(word2idx[token] if token in word2idx else word2idx['<unk>'])
    source_ids = [word2idx['<bos>']] + source_ids + [word2idx['<eos>']]
    source = torch.tensor(source_ids, dtype=torch.long).unsqueeze(1).to(device)

    generated = [word2idx['<bos>']]
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(source)
        for _ in range(max_len):
            previous = torch.tensor([generated[-1]], dtype=torch.long).to(device)
            logits, hidden = model.decoder(previous, hidden, encoder_outputs)
            # greedy choice: take the highest scoring word
            next_id = logits.max(1)[1].item()
            generated.append(next_id)
            if next_id == word2idx['<eos>']:
                break

    words = lookup_words(idx2word, generated)
    return ' '.join(words).replace('<bos>', '').replace('<eos>', '').strip()
430
+
431
def generateNorm188(sentence, history, max_len=12,
                    word2idx=word2idx, idx2word=idx2word,
                    device=device, tokenize=tokenize, preprocess_text=preprocess_text,
                    lookup_words=lookup_words, models_dict=models_dict):
    """
    Generate a response with the 'NormalSeq2Seq-188M' model (greedy decoding)
    :param sentence: sentence to respond to
    :param history: previous chat turns; accepted but not used
    :param max_len: maximum number of decoding steps
    :param word2idx: word to index mapping
    :param idx2word: index to word mapping
    :param device: device the input tensors are moved to
    :param tokenize: callable splitting the cleaned sentence into tokens
    :param preprocess_text: callable cleaning the raw sentence
    :param lookup_words: callable mapping indices back to words
    :param models_dict: mapping from model name to loaded model
    :return: response text with the <bos>/<eos> markers removed
    """
    model = models_dict['NormalSeq2Seq-188M']
    model.eval()

    # Encode the sentence as <bos> ... <eos>; unknown tokens map to <unk>.
    source_ids = []
    for token in tokenize(preprocess_text(sentence)):
        source_ids.append(word2idx[token] if token in word2idx else word2idx['<unk>'])
    source_ids = [word2idx['<bos>']] + source_ids + [word2idx['<eos>']]
    source = torch.tensor(source_ids, dtype=torch.long).unsqueeze(1).to(device)

    generated = [word2idx['<bos>']]
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(source)
        for _ in range(max_len):
            previous = torch.tensor([generated[-1]], dtype=torch.long).to(device)
            logits, hidden = model.decoder(previous, hidden, encoder_outputs)
            # greedy choice: take the highest scoring word
            next_id = logits.max(1)[1].item()
            generated.append(next_id)
            if next_id == word2idx['<eos>']:
                break

    words = lookup_words(idx2word, generated)
    return ' '.join(words).replace('<bos>', '').replace('<eos>', '').strip()
463
+
464
def generateAttn219(sentence, history, max_len=12,
                    word2idx=word2idx2, idx2word=idx2word2,
                    device=device, tokenize=tokenize, preprocess_text=preprocess_text,
                    lookup_words=lookup_words, models_dict=models_dict):
    """
    Generate a response with the 'AttentionSeq2Seq-219M' model (greedy decoding)
    :param sentence: sentence to respond to
    :param history: previous chat turns; accepted but not used
    :param max_len: maximum number of decoding steps
    :param word2idx: word to index mapping
    :param idx2word: index to word mapping
    :param device: device the input tensors are moved to
    :param tokenize: callable splitting the cleaned sentence into tokens
    :param preprocess_text: callable cleaning the raw sentence
    :param lookup_words: callable mapping indices back to words
    :param models_dict: mapping from model name to loaded model
    :return: response text with the <bos>/<eos> markers removed
    """
    model = models_dict['AttentionSeq2Seq-219M']
    model.eval()

    # Encode the sentence as <bos> ... <eos>; unknown tokens map to <unk>.
    source_ids = []
    for token in tokenize(preprocess_text(sentence)):
        source_ids.append(word2idx[token] if token in word2idx else word2idx['<unk>'])
    source_ids = [word2idx['<bos>']] + source_ids + [word2idx['<eos>']]
    source = torch.tensor(source_ids, dtype=torch.long).unsqueeze(1).to(device)

    generated = [word2idx['<bos>']]
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(source)
        for _ in range(max_len):
            previous = torch.tensor([generated[-1]], dtype=torch.long).to(device)
            logits, hidden = model.decoder(previous, hidden, encoder_outputs)
            # greedy choice: take the highest scoring word
            next_id = logits.max(1)[1].item()
            generated.append(next_id)
            if next_id == word2idx['<eos>']:
                break

    words = lookup_words(idx2word, generated)
    return ' '.join(words).replace('<bos>', '').replace('<eos>', '').strip()
496
+
497
def generateNorm219(sentence, history, max_len=12,
                    word2idx=word2idx2, idx2word=idx2word2,
                    device=device, tokenize=tokenize, preprocess_text=preprocess_text,
                    lookup_words=lookup_words, models_dict=models_dict):
    """
    Generate a response with the 'NormalSeq2Seq-219M' model (greedy decoding)
    :param sentence: sentence to respond to
    :param history: previous chat turns; accepted but not used
    :param max_len: maximum number of decoding steps
    :param word2idx: word to index mapping
    :param idx2word: index to word mapping
    :param device: device the input tensors are moved to
    :param tokenize: callable splitting the cleaned sentence into tokens
    :param preprocess_text: callable cleaning the raw sentence
    :param lookup_words: callable mapping indices back to words
    :param models_dict: mapping from model name to loaded model
    :return: response text with the <bos>/<eos> markers removed
    """
    model = models_dict['NormalSeq2Seq-219M']
    model.eval()

    # Encode the sentence as <bos> ... <eos>; unknown tokens map to <unk>.
    source_ids = []
    for token in tokenize(preprocess_text(sentence)):
        source_ids.append(word2idx[token] if token in word2idx else word2idx['<unk>'])
    source_ids = [word2idx['<bos>']] + source_ids + [word2idx['<eos>']]
    source = torch.tensor(source_ids, dtype=torch.long).unsqueeze(1).to(device)

    generated = [word2idx['<bos>']]
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(source)
        for _ in range(max_len):
            previous = torch.tensor([generated[-1]], dtype=torch.long).to(device)
            logits, hidden = model.decoder(previous, hidden, encoder_outputs)
            # greedy choice: take the highest scoring word
            next_id = logits.max(1)[1].item()
            generated.append(next_id)
            if next_id == word2idx['<eos>']:
                break

    words = lookup_words(idx2word, generated)
    return ' '.join(words).replace('<bos>', '').replace('<eos>', '').strip()
529
+
530
def tokenize_context(text, nlp=nlp):
    """
    Split text into token strings, skipping stop words
    :param text: text to be tokenized
    :param nlp: pipeline object whose ``tokenizer`` is applied to the text
    :return: list with the ``text`` of every token whose ``is_stop`` flag is false
    """
    kept = []
    for tok in nlp.tokenizer(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept
537
+
538
def generateAttn219SW(sentence, history, max_len=12,
                      word2idx=word2idx3, idx2word=idx2word3,
                      device=device, tokenize_context=tokenize_context,
                      preprocess_text=preprocess_text,
                      lookup_words=lookup_words, models_dict=models_dict):
    """
    Generate a response with the 'AttentionSeq2Seq-219M-SW' model (greedy decoding)
    :param sentence: sentence to respond to
    :param history: previous chat turns; accepted but not used
    :param max_len: maximum number of decoding steps
    :param word2idx: word to index mapping
    :param idx2word: index to word mapping
    :param device: device the input tensors are moved to
    :param tokenize_context: callable splitting the cleaned sentence into
                             tokens with stop words removed
    :param preprocess_text: callable cleaning the raw sentence
    :param lookup_words: callable mapping indices back to words
    :param models_dict: mapping from model name to loaded model
    :return: response text with the <bos>/<eos> markers removed
    """
    # The SW model must be used with the SW vocabulary; this previously
    # selected 'AttentionSeq2Seq-219M', whose vocabulary is a different one.
    model = models_dict['AttentionSeq2Seq-219M-SW']
    model.eval()

    # Encode the sentence as <bos> ... <eos>; unknown tokens map to <unk>.
    source_ids = []
    for token in tokenize_context(preprocess_text(sentence)):
        source_ids.append(word2idx[token] if token in word2idx else word2idx['<unk>'])
    source_ids = [word2idx['<bos>']] + source_ids + [word2idx['<eos>']]
    source = torch.tensor(source_ids, dtype=torch.long).unsqueeze(1).to(device)

    generated = [word2idx['<bos>']]
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(source)
        for _ in range(max_len):
            previous = torch.tensor([generated[-1]], dtype=torch.long).to(device)
            logits, hidden = model.decoder(previous, hidden, encoder_outputs)
            # greedy choice: take the highest scoring word
            next_id = logits.max(1)[1].item()
            generated.append(next_id)
            if next_id == word2idx['<eos>']:
                break

    words = lookup_words(idx2word, generated)
    return ' '.join(words).replace('<bos>', '').replace('<eos>', '').strip()
571
+
572
def generateNorm219SW(sentence, history, max_len=12,
                      word2idx=word2idx3, idx2word=idx2word3,
                      device=device, tokenize_context=tokenize_context, preprocess_text=preprocess_text,
                      lookup_words=lookup_words, models_dict=models_dict):
    """
    Generate a response with the 'NormalSeq2Seq-219M-SW' model (greedy decoding)
    :param sentence: sentence to respond to
    :param history: previous chat turns; accepted but not used
    :param max_len: maximum number of decoding steps
    :param word2idx: word to index mapping
    :param idx2word: index to word mapping
    :param device: device the input tensors are moved to
    :param tokenize_context: callable splitting the cleaned sentence into
                             tokens with stop words removed
    :param preprocess_text: callable cleaning the raw sentence
    :param lookup_words: callable mapping indices back to words
    :param models_dict: mapping from model name to loaded model
    :return: response text with the <bos>/<eos> markers removed
    """
    # The SW model must be used with the SW vocabulary; this previously
    # selected 'NormalSeq2Seq-219M', whose vocabulary is a different one.
    model = models_dict['NormalSeq2Seq-219M-SW']
    model.eval()

    # Encode the sentence as <bos> ... <eos>; unknown tokens map to <unk>.
    source_ids = []
    for token in tokenize_context(preprocess_text(sentence)):
        source_ids.append(word2idx[token] if token in word2idx else word2idx['<unk>'])
    source_ids = [word2idx['<bos>']] + source_ids + [word2idx['<eos>']]
    source = torch.tensor(source_ids, dtype=torch.long).unsqueeze(1).to(device)

    generated = [word2idx['<bos>']]
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(source)
        for _ in range(max_len):
            previous = torch.tensor([generated[-1]], dtype=torch.long).to(device)
            logits, hidden = model.decoder(previous, hidden, encoder_outputs)
            # greedy choice: take the highest scoring word
            next_id = logits.max(1)[1].item()
            generated.append(next_id)
            if next_id == word2idx['<eos>']:
                break

    words = lookup_words(idx2word, generated)
    return ' '.join(words).replace('<bos>', '').replace('<eos>', '').strip()
604
+
605
# Chat interfaces for the models without attention.
norm188 = gr.ChatInterface(
    fn=generateNorm188,
    title="NormalSeq2Seq-188M",
    description="""Seq2Seq Generative Chatbot without Attention.

188,204,500 trainable parameters""",
)
norm219 = gr.ChatInterface(
    fn=generateNorm219,
    title="NormalSeq2Seq-219M",
    description="""Seq2Seq Generative Chatbot without Attention.

219,456,724 trainable parameters""",
)
norm219sw = gr.ChatInterface(
    fn=generateNorm219SW,
    title="NormalSeq2Seq-219M-SW",
    description="""Seq2Seq Generative Chatbot without Attention.

219,451,344 trainable parameters

Trained with stop words removed for context (input) and more data.""",
)

# Chat interfaces for the models with attention.
attn188 = gr.ChatInterface(
    fn=generateAttn188,
    title="AttentionSeq2Seq-188M",
    description="""Seq2Seq Generative Chatbot with Attention.

188,229,108 trainable parameters""",
)
attn219 = gr.ChatInterface(
    fn=generateAttn219,
    title="AttentionSeq2Seq-219M",
    description="""Seq2Seq Generative Chatbot with Attention.

219,505,940 trainable parameters
""",
)
attn219sw = gr.ChatInterface(
    fn=generateAttn219SW,
    title="AttentionSeq2Seq-219M-SW",
    description="""Seq2Seq Generative Chatbot with Attention.

219,500,560 trainable parameters

Trained with stop words removed for context (input) and more data""",
)

# Page layout: the notice on top, then one tab group for the models without
# attention followed by one for the models with attention.
with gr.Blocks() as demo:
    gr.Markdown(""" > This chatbot is created as part of the Group Project Practical Assessment for University of Liverpool's CSCK507 Natural Language Processing and Understanding (June 2023)

> Disclaimer: Please be advised that this chatbot is an AI language model designed to generate responses based on patterns in data it has been trained on (Ubuntu Dialogue Dataset).
While efforts have been made to ensure that the responses generated are appropriate and respectful, there is a possibility that the chatbot may occasionally produce content that could be offensive, vulgar, or inappropriate.""")
    tab_names = ["188M", "219M", "219M-SW"]
    gr.TabbedInterface([norm188, norm219, norm219sw], list(tab_names))
    gr.TabbedInterface([attn188, attn219, attn219sw], list(tab_names))

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ numpy<1.23
3
+ setuptools<60.0
4
+ pandas
5
+ requests
6
+ spacy
7
+ torch
8
+ torchtext
9
+ nltk
10
+ sentence-transformers
11
+ scipy
12
+ en-core-web-sm @ https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
vocab/idx2word.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/word2idx.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab219/idx2word.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab219/word2idx.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab219SW/idx2word.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab219SW/word2idx.json ADDED
The diff for this file is too large to render. See raw diff