|
import config |
|
import torch |
|
|
|
|
|
class BERTDataset:
    """Indexable dataset that turns review texts into fixed-length BERT inputs.

    Each item is tokenized with ``config.TOKENIZER`` and then padded or
    truncated so that every returned sequence has exactly ``config.MAX_LEN``
    positions.
    """

    def __init__(self, review, target):
        """Store the samples and read the tokenizer settings from ``config``.

        Args:
            review: Indexable, sized collection of review texts.
            target: Indexable collection of targets, indexed the same way
                as ``review``.
        """
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        """Return the number of reviews."""
        return len(self.review)

    def __getitem__(self, item):
        """Encode one review and return it, with its target, as tensors.

        Args:
            item: Index into ``review`` and ``target``.

        Returns:
            A dict with ``ids``, ``mask`` and ``token_type_ids`` (long
            tensors of length ``max_len``) and ``targets`` (float tensor).
        """
        review = str(self.review[item])
        # Collapse every run of whitespace into a single space.
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Without an explicit truncation request, recent tokenizer
            # versions return sequences longer than max_length.
            truncation=True,
        )

        # Safety net: clip anyway, so a tokenizer that ignored the truncation
        # request cannot produce over-length (and therefore un-batchable)
        # tensors.
        ids = inputs["input_ids"][: self.max_len]
        mask = inputs["attention_mask"][: self.max_len]
        token_type_ids = inputs["token_type_ids"][: self.max_len]

        # Right-pad with zeros up to max_len; never negative after clipping.
        padding_length = self.max_len - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float),
        }
|
|