# Hugging Face Space: cheesexuebao/murphy (status: Sleeping)
# File size: 3,701 bytes
### Install the required packages
# !pip install transformers
# !pip install torchmetrics
# !pip3 install ogb pytorch_lightning -q



import os

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
# import pytorch_lightning as pl

# Let pandas print up to 500 columns before it starts eliding them.
pd.set_option('display.max_columns', 500)

# Seed constant; nothing in the visible part of this file reads it.
RANDOM_SEED = 42


class ModelTagger(nn.Module):
    """BERT encoder followed by a 4-output linear head with sigmoid scores.

    The encoder's pooled output is fed to one linear layer; a sigmoid turns
    each of the four outputs into an independent score between 0 and 1.
    """

    def __init__(self, model_path="bert-base-uncased"):
        """Create the encoder, the classification head and the loss.

        Args:
            model_path: Name or path passed to ``BertModel.from_pretrained``.
        """
        super().__init__()

        encoder = BertModel.from_pretrained(model_path, return_dict=True)
        self.bert = encoder
        self.classifier = nn.Linear(encoder.config.hidden_size, 4)
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Score a batch and, when labels are supplied, compute the loss.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
            labels: Optional targets for ``nn.BCELoss``; presumably floats
                with the same shape as the scores -- confirm against the
                training code.

        Returns:
            A ``(loss, scores)`` tuple. ``loss`` is the plain integer ``0``
            when ``labels`` is ``None``, otherwise the BCE loss tensor.
            ``scores`` holds the sigmoid outputs of the linear head.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.pooler_output)
        scores = torch.sigmoid(logits)

        # Without targets there is nothing to compare against.
        if labels is None:
            return 0, scores
        return self.criterion(scores, labels), scores
  

class Predict_Dataset(Dataset):
    """Dataset that tokenizes one text column of a DataFrame on access."""

    def __init__(
        self,
        data: pd.DataFrame,
        text_col: str,
        tokenizer: BertTokenizer,
        max_token_len: int = 128
    ):
        """Store the frame, the column to read and the tokenizer settings.

        Args:
            data: Frame holding the texts to score.
            text_col: Name of the column in ``data`` that contains the text.
            tokenizer: Tokenizer whose ``encode_plus`` method is called per row.
            max_token_len: Length every encoded text is padded/truncated to.
        """
        self.data = data
        self.text_col = text_col
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize the text at positional row ``index``.

        Returns:
            A dict with the raw text under ``"post"`` and the flattened
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        """
        text = self.data.iloc[index][self.text_col]

        tokenizer_options = {
            "add_special_tokens": True,
            "max_length": self.max_token_len,
            "return_token_type_ids": False,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # Flatten so each item is one-dimensional before the DataLoader
        # collates items into a batch.
        return {
            "post": text,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
        }


def predict(data, text_col, tokenizer, model, device, LABEL_COLUMNS, max_token_len=128, batch_size=1000):
    """Run ``model`` over a text column and return per-label score lists.

    Args:
        data: DataFrame holding the texts to score.
        text_col: Name of the column in ``data`` that contains the text.
        tokenizer: Tokenizer handed to ``Predict_Dataset``.
        model: Callable returning a ``(loss, scores)`` pair for
            ``(input_ids, attention_mask)``; scores are assumed to be
            2-D, one row per text and one column per label.
        device: Device the input tensors are moved to before the call.
        LABEL_COLUMNS: Label names, in the order of the model's outputs.
        max_token_len: Token length each text is padded/truncated to.
        batch_size: Number of texts scored per forward pass. Defaults to
            the previously hard-coded 1000.

    Returns:
        A dict mapping each label name to a list with one score per row of
        ``data``. Labels are paired with score columns positionally; if the
        counts differ, the extra names or columns are ignored. Empty
        ``data`` yields an empty list for every label.
    """
    dataset = Predict_Dataset(data, text_col, tokenizer, max_token_len=max_token_len)
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=0)

    batch_scores = []
    # Pure inference: disabling autograd avoids building a graph that would
    # only be thrown away, which saves memory on every batch.
    with torch.no_grad():
        for batch in tqdm(loader):
            _, scores = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            batch_scores.append(scores.detach().cpu())

    # torch.cat raises on an empty list, so handle "no rows" explicitly.
    if not batch_scores:
        return {label: [] for label in LABEL_COLUMNS}

    # Transpose so that each row holds all scores of one label.
    per_label = torch.cat(batch_scores, dim=0).numpy().T

    return {
        label: label_scores.tolist()
        for label, label_scores in zip(LABEL_COLUMNS, per_label)
    }


def get_result(df, result, LABEL_COLUMNS):
    """Add the first four label columns from ``result`` to ``df``.

    Args:
        df: DataFrame to extend; it is modified in place.
        result: Mapping from label name to the values for that column.
        LABEL_COLUMNS: Label names; only the first four entries are used.

    Returns:
        The same ``df`` object, now carrying the four added columns.
    """
    for position in range(4):
        column = LABEL_COLUMNS[position]
        df[column] = result[column]
    return df


# Score the first 20 rows of the Kickstarter sentence file with the
# fine-tuned tagger and write the scores next to the original columns.
Data = pd.read_csv("Kickstarter_sentence_level_5000.csv")
# Take an explicit copy: get_result() adds columns to this frame, and doing
# that on a bare slice of the loaded frame is a chained assignment that
# pandas warns about (SettingWithCopyWarning).
Data = Data[:20].copy()
device = torch.device('cpu')
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"]

# The checkpoint stores the model weights under the 'state_dict' key.
params = torch.load("checkpoints/Kickstarter.ckpt", map_location='cpu')['state_dict']
kick_model = ModelTagger()
# strict=True makes any mismatch between checkpoint and model keys an error.
kick_model.load_state_dict(params, strict=True)
# Switch to evaluation mode before predicting.
kick_model.eval()

kick_model = kick_model.to(device)

kick_fk_doc_result = predict(Data,"content", tokenizer,kick_model, device, LABEL_COLUMNS)

fk_result = get_result(Data, kick_fk_doc_result, LABEL_COLUMNS)

# to_csv does not create missing directories, so make sure it exists.
os.makedirs("output", exist_ok=True)
fk_result.to_csv("output/prediction_origin_Kickstarter.csv")