File size: 4,311 Bytes
ecf6936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from nltk.tokenize import sent_tokenize
import pandas as pd

######################
# Prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct DistillBERTClass_SL class
# 4. Construct Triage_SL class
# 5. Define predict_SL function
# 6. Load model_SL & call eval()
# 7. Predefine predict_params_SL
####################

from transformers import DistilBertTokenizer

# Module-level tokenizer for the 'distilbert-base-cased' checkpoint, created
# once at import time and passed to Triage_SL when building prediction datasets.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


import torch

"""### DataSet Class -- Triage_SL"""

from torch.utils.data import Dataset, DataLoader

class Triage_SL(Dataset):
    """Sentence-level dataset that tokenizes one sentence per item.

    Wraps a DataFrame with a ``sentence`` column and, on each lookup, encodes
    the selected sentence into token ids plus an attention mask.

    Args:
        dataframe: pandas DataFrame holding the text in a ``sentence`` column.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        max_len: length each encoded sentence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer  # used in __getitem__
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sentence at position ``index``.

        Returns:
            dict: ``'ids'`` (token ids) and ``'mask'`` (attention mask), both
            ``torch.long`` tensors built from the tokenizer output.

        Raises:
            StopIteration: if ``index`` is not smaller than ``len(self)``.
        """
        if index >= len(self):
            # Exception type kept as-is so existing callers are unaffected.
            raise StopIteration
        # Positional lookup: works even when the DataFrame index is not the
        # default 0..n-1 range (e.g. after filtering or slicing).
        sent = str(self.data.sentence.iloc[index])
        # Collapse every run of whitespace into a single space.
        sent = " ".join(sent.split())
        # encode_plus tokenizes, adds [CLS]/[SEP], maps tokens to ids,
        # pads/truncates to max_len and builds the attention mask that
        # separates real tokens from [PAD] tokens.
        inputs = self.tokenizer.encode_plus(
            sent,                       # sentence to encode
            None,                       # text_pair
            add_special_tokens=True,    # add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            padding='max_length',       # replaces deprecated pad_to_max_length=True
            return_token_type_ids=True,
            truncation=True,
        )
        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return self.len


# Read in an essay and return a DataFrame with one sentence per row.
def essay_to_sent_df(essay):
    """Break an essay into sentences and return them as a DataFrame.

    The text is split on newlines, empty lines are dropped, and each remaining
    line is passed through NLTK's ``sent_tokenize``. The resulting sentences
    are returned, in order, in a single ``sentence`` column.
    """
    collected = [
        piece
        for line in essay.split('\n')
        if len(line) > 0
        for piece in sent_tokenize(line)
    ]
    return pd.DataFrame(collected, columns=['sentence'])

# Key constants used when building the prediction dataset and data loader.
# MAX_LEN is handed to Triage_SL as max_len, i.e. the length sentences are
# padded or truncated to during tokenization.
MAX_LEN = 512
"""### Predefine predict_params_SL"""

# Keyword arguments unpacked into the DataLoader in predict_mainidea_sent_old.
# One sentence per batch; shuffling stays off because the predicted labels are
# later zipped with the sentences in their original order.
PREDICT_BATCH_SIZE = 1
predict_params_SL = {'batch_size': PREDICT_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

"""### Predict Fn -- predict_SL"""

# Element-wise sigmoid module, instantiated once at import time so it can be
# reused to turn raw logits into probabilities.
sigmoid = torch.nn.Sigmoid()

def predict_SL(model, validation_loader, device='cpu', threshold=0.5):
    """Run the model over a loader and return one binary label per sentence.

    Args:
        model: callable taking ``(ids, mask)`` and returning a mapping with a
            ``"logits"`` tensor; must also provide ``eval()``. Assumed to emit
            a single logit per sentence.
        validation_loader: iterable of batches, each a dict with ``'ids'`` and
            ``'mask'`` tensors. Any batch size is accepted.
        device: device the input tensors are moved to. The model is expected
            to already be on that device. Defaults to ``'cpu'``.
        threshold: probability cut-off; a sentence whose sigmoid output is
            strictly greater than this is labelled ``1.0``. Defaults to 0.5.

    Returns:
        list[float]: ``1.0`` or ``0.0`` for each sentence, in loader order.
    """
    epoch_val_outputs = []
    model.eval()
    with torch.no_grad():
        for data in validation_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            logits = model(ids, mask)["logits"]
            labels = (torch.sigmoid(logits) > threshold).float()
            # Flatten rather than squeeze()+item(): item() only works on a
            # single-element tensor, so batches of more than one sentence
            # would raise. For a batch of one the result is unchanged.
            epoch_val_outputs.extend(labels.reshape(-1).tolist())
    return epoch_val_outputs

def predict_mainidea_sent_old(paragraph, model):
    """Label every sentence of ``paragraph`` with the model's prediction.

    The text is split into sentences, wrapped in a ``Triage_SL`` dataset and
    fed through ``predict_SL`` on the CPU. The raw label list is printed, and
    a DataFrame with a stringified ``label`` column and the matching
    ``sentence`` column is returned.
    """
    # Build the sentence table and the loader that feeds the model.
    sentences = essay_to_sent_df(paragraph)
    loader = DataLoader(
        Triage_SL(sentences, tokenizer, MAX_LEN),
        **predict_params_SL
    )

    # Prediction runs on the CPU.
    model.to('cpu')
    labels = predict_SL(model, loader)
    print(labels)

    # Pair each predicted label (as text) with its sentence.
    rows = []
    for label, sentence in zip(labels, sentences.sentence):
        rows.append((str(label), sentence))
    return pd.DataFrame(rows, columns=['label', 'sentence'])