thak123 committed on
Commit
7d76b6a
·
1 Parent(s): 819c669

Update dataset.py

Browse files
Files changed (1) hide show
  1. dataset.py +40 -0
dataset.py CHANGED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import config
2
+ import torch
3
+
4
+
5
class BERTDataset:
    """Indexable dataset that turns (review, target) pairs into fixed-length tensors.

    Each item is tokenized with ``config.TOKENIZER`` and forced to exactly
    ``config.MAX_LEN`` positions, so items can be stacked into a batch.
    """

    def __init__(self, review, target):
        """Store the parallel sequences of raw reviews and their targets.

        Args:
            review: Indexable collection of review texts; each entry is passed
                through ``str()`` before tokenization.
            target: Indexable collection of targets, aligned with ``review``.
        """
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        """Return the number of reviews held by the dataset."""
        return len(self.review)

    def _fit_to_max_len(self, values):
        """Return ``values`` as a list of exactly ``self.max_len`` entries.

        Longer sequences are cut; shorter ones are right-padded with 0.
        """
        # Truncate first: the padding count below must never be negative,
        # otherwise ``[0] * n`` is empty and the over-long sequence leaks out.
        values = list(values)[: self.max_len]
        return values + [0] * (self.max_len - len(values))

    def __getitem__(self, item):
        """Tokenize the review at index ``item`` and return model-ready tensors.

        Args:
            item: Index into ``self.review`` / ``self.target``.

        Returns:
            dict with keys ``'ids'``, ``'mask'`` and ``'token_type_ids'``
            (``torch.long`` tensors of length ``self.max_len``) and
            ``'targets'`` (``torch.float`` tensor built from the target).
        """
        # Collapse every run of whitespace (including newlines) to one space.
        review = " ".join(str(self.review[item]).split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
        )

        # Whether ``max_length`` alone makes the tokenizer truncate depends on
        # the tokenizer implementation/version, so the length is enforced here
        # as well.
        # NOTE(review): this safety-net cut can drop a trailing special token;
        # tokenizer-side truncation is preferable where it is available.
        ids = self._fit_to_max_len(inputs["input_ids"])
        mask = self._fit_to_max_len(inputs["attention_mask"])
        token_type_ids = self._fit_to_max_len(inputs["token_type_ids"])

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float),
        }