"""Tone-of-voice scoring helpers built around a BERT sequence classifier.

``predict_csv`` scores every row of a DataFrame column in batches and
``predict_single`` scores one sentence.  Running the file as a script scores
the first 20 rows of a sample CSV and writes the result to ``output/``.
"""
import glob
import os

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import (
    BertTokenizerFast as BertTokenizer,
    BertForSequenceClassification,
)

RANDOM_SEED = 42
# NOTE(review): pandas documents no ``RANDOM_SEED`` setting, so this only
# attaches an ad-hoc attribute to the pandas module.  Kept so the module's
# observable behaviour is unchanged -- confirm whether it can be dropped.
pd.RANDOM_SEED = 42

# Column i of the softmax output is written to LABEL_COLUMNS[i].
# Presumably this matches the label order the model was trained with --
# verify against the model card.
LABEL_COLUMNS = [
    "Assertive Tone",
    "Conversational Tone",
    "Emotional Tone",
    "Informative Tone",
    "None",
]


@torch.no_grad()
def _predict_probabilities(texts, tokenizer, model, device, max_token_len):
    """Tokenize *texts* and return the row-wise softmax of the model logits.

    Args:
        texts: A single string or a list of strings to score.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors (``return_tensors="pt"``).
        model: Model whose output exposes ``.logits``.
        device: Device the input tensors are moved to before the forward pass.
        max_token_len: Every input is padded/truncated to this many tokens.

    Returns:
        A CPU tensor with the softmax taken over ``dim=1`` of the logits.
    """
    encoding = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    logits = model(
        encoding["input_ids"].to(device),
        encoding["attention_mask"].to(device),
        return_dict=True,
    ).logits
    return torch.softmax(logits, dim=1).cpu()


@torch.no_grad()
def predict_csv(
    data: pd.DataFrame,
    text_col: str,
    tokenizer,
    model,
    device,
    text_bs: int = 16,
    max_token_len: int = 128,
) -> pd.DataFrame:
    """Score every row of ``data[text_col]`` and store the scores in *data*.

    The texts are processed in batches of ``text_bs``.  One column per entry
    of ``LABEL_COLUMNS`` is added to (or overwritten in) *data*, holding the
    softmax scores rounded to 8 decimals.

    Args:
        data: Frame containing the texts; it is modified in place.
        text_col: Name of the column holding the texts.
        tokenizer: Tokenizer forwarded to the model input preparation.
        model: Classifier whose output exposes ``.logits``.
        device: Device the input tensors are moved to.
        text_bs: Number of texts per forward pass.
        max_token_len: Token length every text is padded/truncated to.

    Returns:
        The same *data* object, with the label columns filled in.
    """
    posts = data[text_col]
    num_text = len(posts)
    batch_starts = range(0, num_text, text_bs)

    predictions = []
    for start in tqdm(batch_starts, total=len(batch_starts), desc="Processing..."):
        # .iloc keeps the slice positional regardless of the frame's index;
        # slicing past the end is clipped, so no explicit min() is needed.
        texts = posts.iloc[start:start + text_bs].tolist()
        predictions.append(
            _predict_probabilities(texts, tokenizer, model, device, max_token_len)
        )

    if predictions:
        probabilities = torch.cat(predictions, dim=0)
    else:
        # torch.cat rejects an empty list; an empty frame simply gets empty
        # label columns instead of crashing.
        probabilities = torch.empty((0, len(LABEL_COLUMNS)))

    for idx, label in enumerate(LABEL_COLUMNS):
        data[label] = [round(score, 8) for score in probabilities[:, idx].tolist()]
    return data


@torch.no_grad()
def predict_single(sentence: str, tokenizer, model, device, max_token_len: int = 128) -> list:
    """Score a single sentence.

    Args:
        sentence: Text to classify.
        tokenizer: Tokenizer forwarded to the model input preparation.
        model: Classifier whose output exposes ``.logits``.
        device: Device the input tensors are moved to.
        max_token_len: Token length the sentence is padded/truncated to.

    Returns:
        A flat list of softmax scores rounded to 8 decimals, in model output
        order.
    """
    probabilities = _predict_probabilities(
        sentence, tokenizer, model, device, max_token_len
    )
    return [round(score, 8) for score in probabilities.flatten().tolist()]


def main():
    """Score the first 20 rows of the sample CSV and write them to ``output/``."""
    input_path = "assets/Kickstarter_sentence_level_5000.csv"
    output_path = "output/prediction_Brand_Tone_of_Voice.csv"
    model_name = "Oliver12315/Brand_Tone_of_Voice"

    # predict_csv adds columns in place; copy the 20-row slice so pandas is
    # not asked to write into a view of the full frame.
    data = pd.read_csv(input_path).iloc[:20].copy()
    device = torch.device("cpu")

    # Load model directly
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name)
    model = model.to(device)

    fk_doc_result = predict_csv(data, "content", tokenizer, model, device)
    single_response = predict_single(
        "Games of the imagination teach us actions have consequences in a realm that can be reset.",
        tokenizer,
        model,
        device,
    )
    # Previously computed and discarded; show it so the example call is useful.
    print(single_response)

    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    fk_doc_result.to_csv(output_path)


if __name__ == "__main__":
    main()