# NOTE(review): the original file began with non-Python scrape residue from a
# Hugging Face Space file listing ("Spaces", "Sleeping", "File size: 3,174
# Bytes", and a line-number gutter). It has been reduced to this comment so
# the module is valid Python; no program content was lost.
import pandas as pd
from tqdm.auto import tqdm
import torch
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
import os
import glob
# Reproducibility: seed torch's RNG. The original `pd.RANDOM_SEED = 42` was a
# bug — pandas has no such setting, so it only attached a dead attribute to
# the pandas module and seeded nothing.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

# Output heads of the fine-tuned tone classifier, in logit order; predict_csv
# writes one DataFrame column per entry.
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"]
@torch.no_grad()
def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
    """Score every row of *data* with the multi-label tone classifier.

    Texts from ``data[text_col]`` are tokenized in mini-batches, pushed through
    *model*, and the per-label sigmoid probabilities are written back onto
    *data* as one new column per entry of ``LABEL_COLUMNS`` (the DataFrame is
    mutated in place and also returned, matching the original contract).

    Args:
        data: pandas DataFrame holding the texts (mutated in place).
        text_col: name of the column containing the raw sentences.
        tokenizer: HF tokenizer callable returning ``input_ids``/``attention_mask``.
        model: sequence-classification model whose head size must equal
            ``len(LABEL_COLUMNS)``.
        device: torch device the model already lives on.
        text_bs: mini-batch size.
        max_token_len: fixed padding/truncation length.

    Returns:
        The same DataFrame with one probability column per label.
    """
    texts = data[text_col]
    n = len(texts)
    batch_scores = []
    # tqdm infers the total from the range itself; no explicit total needed.
    for start in tqdm(range(0, n, text_bs), desc="Processing..."):
        # .iloc makes the positional slice explicit; slicing past the end is
        # safe (it clamps), so no min() guard is required.
        batch = texts.iloc[start:start + text_bs].tolist()
        encoding = tokenizer(
            batch,
            add_special_tokens=True,
            max_length=max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        logits = model(
            encoding["input_ids"].to(device),
            encoding["attention_mask"].to(device),
            return_dict=True
        ).logits
        # Independent sigmoid per head: multi-label, not softmax.
        batch_scores.append(torch.sigmoid(logits).cpu())
    if not batch_scores:
        # Empty input: original code crashed in torch.cat; emit empty columns.
        for col in LABEL_COLUMNS:
            data[col] = []
        return data
    scores = torch.cat(batch_scores, dim=0).numpy()
    # Generalized over LABEL_COLUMNS instead of four hard-coded index lines,
    # so adding/removing a label needs no change here.
    for j, col in enumerate(LABEL_COLUMNS):
        data[col] = scores[:, j].tolist()
    return data
@torch.no_grad()
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Return the per-label sigmoid probabilities for one sentence.

    The sentence is tokenized to a fixed ``max_token_len`` window, scored by
    *model* on *device*, and the logits are squashed independently per head
    (multi-label). The result is a flat Python list of floats, one per label.
    """
    enc = tokenizer(
        sentence,
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    output = model(input_ids, attention_mask, return_dict=True)
    probabilities = torch.sigmoid(output.logits)
    return probabilities.flatten().cpu().numpy().T.tolist()
def model_factory(local_path, device):
    """Load every saved checkpoint found directly under *local_path*.

    Each entry matching ``local_path/*`` is loaded as a BERT
    sequence-classification checkpoint (tokenizer + model, model moved to
    *device*). The registry maps the checkpoint's basename (extension
    stripped) to ``{"model": ..., "tokenizer": ...}``.
    """
    registry = {}
    for checkpoint in glob.glob(f"{local_path}/*"):
        name, _ = os.path.splitext(os.path.basename(checkpoint))
        bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
        bert_model = BertForSequenceClassification.from_pretrained(checkpoint).to(device)
        registry[name] = {
            "model": bert_model,
            "tokenizer": bert_tokenizer,
        }
    return registry
if __name__ == "__main__":
    # Fix: ensure the destination directory exists before any to_csv call —
    # the original crashed with FileNotFoundError on a fresh checkout.
    os.makedirs("output", exist_ok=True)

    data = pd.read_csv("Kickstarter_sentence_level_5000.csv")
    data = data[:20]  # quick smoke run: score only the first 20 rows
    device = torch.device('cpu')

    manager = model_factory("./models", device)
    for model_name, bundle in manager.items():
        model, tokenizer = bundle['model'], bundle['tokenizer']
        # predict_csv mutates `data` in place; each model overwrites the
        # label columns before its snapshot is written out.
        scored = predict_csv(data, "content", tokenizer, model, device)
        # Kept for parity with the original script even though the result is
        # unused — it exercises the single-sentence path per model.
        single_response = predict_single(
            "Games of the imagination teach us actions have consequences in a realm that can be reset.",
            tokenizer, model, device,
        )
        scored.to_csv(f"output/prediction_{model_name}.csv")