File size: 2,788 Bytes
10fa1e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
from tqdm.auto import tqdm
import torch
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
import os
import glob


# Seed value. NOTE(review): not referenced anywhere else in the visible part of
# this file — confirm whether it is still needed.
RANDOM_SEED = 42
# NOTE(review): this only attaches an attribute called RANDOM_SEED to the
# imported pandas module object; it presumably does not seed any random number
# generator — confirm and consider removing.
pd.RANDOM_SEED = 42
# Names of the DataFrame columns that predict_csv fills in: column k of the
# softmax output is written to LABEL_COLUMNS[k].
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone", "None"]
  

@torch.no_grad()
def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
    """Score every row of ``data[text_col]`` and add one score column per label.

    Texts are tokenized and passed through ``model`` in batches of
    ``text_bs``. The logits of each batch are turned into probabilities with
    a softmax over dim 1, and column ``k`` of the result is stored, rounded
    to 8 decimal places, under ``LABEL_COLUMNS[k]``.

    Args:
        data: pandas DataFrame holding the texts. It is modified in place:
            the label columns are added (or overwritten).
        text_col: Name of the column in ``data`` that contains the text.
        tokenizer: Callable tokenizer whose result provides ``"input_ids"``
            and ``"attention_mask"`` tensors.
        model: Callable returning an object with a ``logits`` attribute.
        device: Torch device the encoded batches are moved to.
        text_bs: Number of texts per forward pass; must be at least 1.
        max_token_len: Every text is padded or truncated to this many tokens.

    Returns:
        The same ``data`` object, with the label columns filled in.

    Raises:
        ValueError: If ``text_bs`` is smaller than 1.
    """
    if text_bs < 1:
        raise ValueError(f"text_bs must be at least 1, got {text_bs!r}")

    texts = data[text_col]
    num_text = len(texts)
    if num_text == 0:
        # torch.cat() rejects an empty list of tensors, so "nothing to score"
        # is handled explicitly: add the label columns, empty, and return.
        for label in LABEL_COLUMNS:
            data[label] = pd.Series(index=data.index, dtype="float64")
        return data

    batch_starts = range(0, num_text, text_bs)
    batch_probs = []
    for start in tqdm(batch_starts, total=len(batch_starts), desc="Processing..."):
        # .iloc keeps the batching positional whatever index the frame has;
        # a slice running past the end is clamped automatically.
        batch_texts = texts.iloc[start:start + text_bs].tolist()
        encoding = tokenizer(
            batch_texts,
            add_special_tokens=True,
            max_length=max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        logits = model(
            encoding["input_ids"].to(device),
            encoding["attention_mask"].to(device),
            return_dict=True
        ).logits
        # Collect results on the CPU so they can be concatenated and
        # converted to NumPy regardless of the inference device.
        batch_probs.append(torch.softmax(logits, dim=1).cpu())

    # Transposed so that row k holds the scores of LABEL_COLUMNS[k] for every
    # text (assumes logits are laid out as (batch, num_labels)).
    scores_by_label = torch.cat(batch_probs, dim=0).numpy().T

    for idx, label in enumerate(LABEL_COLUMNS):
        data[label] = [round(score, 8) for score in scores_by_label[idx].tolist()]
    return data

@torch.no_grad()
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Return the softmax class scores for a single piece of text.

    The text is tokenized (padded or truncated to ``max_token_len`` tokens),
    run through ``model`` on ``device``, and the logits are converted to
    probabilities with a softmax over dim 1.

    Args:
        sentence: Text handed to the tokenizer.
        tokenizer: Callable tokenizer whose result provides ``"input_ids"``
            and ``"attention_mask"`` tensors.
        model: Callable returning an object with a ``logits`` attribute.
        device: Torch device the encoded input is moved to.
        max_token_len: Length the token sequence is padded/truncated to.

    Returns:
        A flat list of Python floats, each rounded to 8 decimal places.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = tokenizer(sentence, **tokenizer_options)

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    output = model(input_ids, attention_mask, return_dict=True)

    probabilities = torch.softmax(output.logits, dim=1)
    # Flatten to 1-D and bring the values back to the CPU as plain floats.
    flat_scores = probabilities.flatten().cpu().numpy().tolist()
    return [round(score, 8) for score in flat_scores]



def main():
    """Score a 20-row sample of the Kickstarter CSV and save the result.

    Loads the "Oliver12315/Brand_Tone_of_Voice" tokenizer and model on the
    CPU, scores the "content" column of the first 20 rows, prints the scores
    of one example sentence, and writes the scored rows to
    ``output/prediction_Brand_Tone_of_Voice.csv``.
    """
    # .copy() gives predict_csv an independent frame to add its columns to,
    # instead of a slice of the full CSV.
    data = pd.read_csv("assets/Kickstarter_sentence_level_5000.csv")[:20].copy()
    device = torch.device('cpu')

    # Load model directly
    tokenizer = BertTokenizer.from_pretrained("Oliver12315/Brand_Tone_of_Voice")
    model = BertForSequenceClassification.from_pretrained("Oliver12315/Brand_Tone_of_Voice")
    model = model.to(device)
    # Inference only: make sure layers such as dropout are in evaluation mode.
    model.eval()

    fk_doc_result = predict_csv(data, "content", tokenizer, model, device)
    single_response = predict_single("Games of the imagination teach us actions have consequences in a realm that can be reset.", tokenizer, model, device)
    # Show the single-sentence scores instead of discarding them.
    print(dict(zip(LABEL_COLUMNS, single_response)))

    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs("output", exist_ok=True)
    fk_doc_result.to_csv("output/prediction_Brand_Tone_of_Voice.csv")


if __name__ == "__main__":
    main()