Spaces:

yutingg
/

essay-main-idea

Sleeping

File size: 4,311 Bytes

ecf6936

from nltk.tokenize import sent_tokenize
import pandas as pd

######################
# Prerequisites:
# 1. pip install transformers
# 2. Define tokenizer + MAX_LEN
# 3. Construct DistillBERTClass_SL class
# 4. Construct Triage_SL class
# 5. Define predict_SL function
# 6. Load model_SL & call eval()
# 7. Predefine predict_params_SL
######################

from transformers import DistilBertTokenizer

# Module-level DistilBERT tokenizer built from the 'distilbert-base-cased'
# checkpoint; it is handed to Triage_SL when the prediction dataset is created.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


import torch

"""### DataSet Class -- Triage_SL"""

from torch.utils.data import Dataset, DataLoader

class Triage_SL(Dataset):
    """Sentence-level dataset that tokenizes one sentence per sample.

    Every item is built from the ``sentence`` column of ``dataframe`` and
    encoded through ``tokenizer.encode_plus`` into fixed-length ``ids`` and
    ``mask`` tensors. No label/target is returned with an item.

    Args:
        dataframe: pandas DataFrame exposing a ``sentence`` column.
        tokenizer: object providing ``encode_plus`` whose result contains
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: length each encoded sentence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer  # used in __getitem__
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sample at position ``index``.

        Args:
            index: zero-based position of the sentence in the dataframe.

        Returns:
            dict with ``'ids'`` (token ids) and ``'mask'`` (attention mask),
            both ``torch.long`` tensors.

        Raises:
            IndexError: if ``index`` is past the end of the dataset.
        """
        # IndexError (not StopIteration) is what the sequence protocol expects;
        # a StopIteration escaping inside a generator turns into RuntimeError.
        if index >= len(self):
            raise IndexError(
                f"index {index} is out of range for dataset of size {len(self)}"
            )
        # Positional lookup: dataset indices are 0..len-1 regardless of the
        # labels carried by the dataframe's own index.
        sent = str(self.data.sentence.iloc[index])
        # Collapse every run of whitespace into a single space.
        sent = " ".join(sent.split())
        # 1.- Split the sentence into tokens.
        # 2.- Add the special [CLS] and [SEP] tokens.
        # 3.- Map the tokens to their IDs.
        # 4.- Pad or truncate all sentences to the same length.
        # 5.- Create the attention masks which explicitly differentiate real
        #     tokens from [PAD] tokens.
        inputs = self.tokenizer.encode_plus(
            sent,                       # Sentence to encode
            None,                       # text_pair
            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            padding='max_length',       # replaces deprecated pad_to_max_length=True
            return_token_type_ids=True,
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
        }

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.len


# Read in an essay and return a sentence-level DataFrame.
def essay_to_sent_df(essay):
    """Break ``essay`` into sentences, one DataFrame row per sentence.

    The essay is cut into paragraphs at newline characters, empty paragraphs
    are dropped, and each remaining paragraph is passed to ``sent_tokenize``.

    Args:
        essay: the essay text as a single string.

    Returns:
        pandas DataFrame with one column, ``'sentence'``, holding the
        sentences in their original order.
    """
    non_empty_paragraphs = filter(None, essay.split('\n'))
    sentence_rows = [
        sentence
        for paragraph in non_empty_paragraphs
        for sentence in sent_tokenize(paragraph)
    ]
    return pd.DataFrame(sentence_rows, columns=['sentence'])

# Key settings used when building the prediction dataset and its DataLoader.
# MAX_LEN is the length every encoded sentence is padded/truncated to.
MAX_LEN = 512
"""### Predefine predict_params_SL"""

# Keyword arguments unpacked into DataLoader(...) for prediction.
PREDICT_BATCH_SIZE = 1
predict_params_SL = dict(
    batch_size=PREDICT_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

"""### Predict Fn -- predict_SL"""

sigmoid = torch.nn.Sigmoid()


def predict_SL(model, validation_loader):
    """Run ``model`` over ``validation_loader`` and return binary labels.

    Each batch's logits are flattened, passed through a sigmoid and
    thresholded at 0.5 (strictly greater than 0.5 yields 1.0, otherwise 0.0).
    Batches of any size are supported; the labels of all batches are
    concatenated in iteration order.

    Args:
        model: callable taking ``(ids, mask)`` and returning a mapping with a
            ``"logits"`` tensor; must also provide ``eval()``.
        validation_loader: iterable of dicts with ``'ids'`` and ``'mask'``
            tensors whose first dimension is the batch dimension.

    Returns:
        list of Python floats (0.0 or 1.0), one per sample.

    Raises:
        ValueError: if the model does not produce exactly one logit per
            sample in a batch.
    """
    cpu_device = 'cpu'
    predictions = []
    model.eval()
    with torch.no_grad():
        for data in validation_loader:
            ids = data['ids'].to(cpu_device, dtype=torch.long)
            mask = data['mask'].to(cpu_device, dtype=torch.long)
            # Flatten instead of squeeze()+item() so batches larger than one
            # sample work; a single-sample batch still yields one value.
            logits = model(ids, mask)["logits"].reshape(-1)
            if logits.numel() != ids.shape[0]:
                raise ValueError(
                    f"expected one logit per sample, got {logits.numel()} "
                    f"logits for a batch of {ids.shape[0]}"
                )
            labels = (sigmoid(logits) > 0.5).float()
            predictions.extend(labels.tolist())
    return predictions

def predict_mainidea_sent_old(paragraph, model):
    """Label every sentence of ``paragraph`` using ``model``.

    The text is split into sentences with ``essay_to_sent_df``, wrapped in a
    ``Triage_SL`` dataset and ``DataLoader``, and scored on the CPU with
    ``predict_SL``. The raw label list is also printed to stdout.

    Args:
        paragraph: text to split into sentences and classify.
        model: model moved to the CPU and passed on to ``predict_SL``.

    Returns:
        pandas DataFrame with columns ``'label'`` (the predicted label
        converted to ``str``) and ``'sentence'``.
    """
    # Build the sentence-level dataset and its loader.
    sentences = essay_to_sent_df(paragraph)
    dataset = Triage_SL(sentences, tokenizer, MAX_LEN)
    loader = DataLoader(dataset, **predict_params_SL)

    # Inference runs on the CPU.
    model.to('cpu')

    labels = predict_SL(model, loader)
    print(labels)

    rows = []
    for label, sentence in zip(labels, sentences.sentence):
        rows.append((str(label), sentence))
    return pd.DataFrame(rows, columns=['label', 'sentence'])