import glob
import os

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification

RANDOM_SEED = 42
# NOTE(review): this sets an ad-hoc attribute on the pandas module; pandas does
# not appear to read it. Kept so module-level behaviour is unchanged -- confirm
# whether anything relies on it before removing.
pd.RANDOM_SEED = 42

# Output column names; column k receives the k-th softmax score of the model.
LABEL_COLUMNS = [
    "Assertive Tone",
    "Conversational Tone",
    "Emotional Tone",
    "Informative Tone",
    "None",
]


@torch.no_grad()
def _predict_proba(texts, tokenizer, model, device, max_token_len=128):
    """Tokenize *texts*, run *model* and return softmax scores on the CPU.

    Shared by :func:`predict_csv` and :func:`predict_single` so both use the
    exact same tokenizer settings.

    Args:
        texts: A single string or a list of strings accepted by *tokenizer*.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        model: Sequence-classification model exposing ``.logits`` on its output.
        device: Device the input tensors are moved to before the forward pass.
        max_token_len: Inputs are padded/truncated to exactly this many tokens.

    Returns:
        A CPU tensor holding ``softmax(logits, dim=1)``.
    """
    encoding = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    logits = model(
        input_ids=encoding["input_ids"].to(device),
        attention_mask=encoding["attention_mask"].to(device),
        return_dict=True,
    ).logits
    return torch.softmax(logits, dim=1).detach().cpu()


@torch.no_grad()
def predict_csv(data: pd.DataFrame, text_col: str, tokenizer, model, device,
                text_bs: int = 16, max_token_len: int = 128) -> pd.DataFrame:
    """Score every row of ``data[text_col]`` and store the scores in *data*.

    The texts are processed in batches of *text_bs*. One column per entry of
    ``LABEL_COLUMNS`` is added to (or overwritten in) *data*, holding the
    softmax score rounded to 8 decimals.

    Args:
        data: Frame containing the texts. It is modified in place.
        text_col: Name of the column holding the texts.
        tokenizer: Tokenizer matching *model*.
        model: Sequence-classification model; it must emit at least
            ``len(LABEL_COLUMNS)`` scores per text, otherwise an
            ``IndexError`` is raised when the columns are filled.
        device: Device the input tensors are moved to.
        text_bs: Number of texts per forward pass.
        max_token_len: Token length every text is padded/truncated to.

    Returns:
        The same *data* object, with the label columns filled in.
    """
    posts = data[text_col]
    num_text = len(posts)

    if num_text == 0:
        # torch.cat() rejects an empty list, so handle "nothing to score"
        # explicitly and still hand back the expected columns.
        for label in LABEL_COLUMNS:
            data[label] = pd.Series(index=data.index, dtype="float64")
        return data

    batch_starts = range(0, num_text, text_bs)
    predictions = []
    for start in tqdm(batch_starts, total=len(batch_starts), desc="Processing..."):
        # .iloc keeps the slice positional regardless of the frame's index;
        # the slice end is clamped automatically on the last batch.
        texts = posts.iloc[start:start + text_bs].tolist()
        predictions.append(
            _predict_proba(texts, tokenizer, model, device, max_token_len)
        )

    # Transpose so that row k holds the scores of label k for all texts.
    scores_by_label = torch.cat(predictions, dim=0).numpy().T
    for label_idx, label in enumerate(LABEL_COLUMNS):
        data[label] = [round(score, 8) for score in scores_by_label[label_idx].tolist()]
    return data


@torch.no_grad()
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Score a single sentence.

    Args:
        sentence: Text to classify.
        tokenizer: Tokenizer matching *model*.
        model: Sequence-classification model.
        device: Device the input tensors are moved to.
        max_token_len: Token length the sentence is padded/truncated to.

    Returns:
        A flat list of softmax scores, each rounded to 8 decimals.
    """
    probabilities = _predict_proba(sentence, tokenizer, model, device, max_token_len)
    return [round(score, 8) for score in probabilities.flatten().tolist()]


def model_factory(local_path, device):
    """Load every model found directly under *local_path*.

    Each entry matched by ``{local_path}/*`` is passed to ``from_pretrained``
    for both the tokenizer and the classifier, and the model is moved to
    *device*.

    Args:
        local_path: Directory whose entries are model checkpoints.
        device: Device every loaded model is moved to.

    Returns:
        A dict mapping the entry's base name (extension stripped) to
        ``{"model": ..., "tokenizer": ...}``.
    """
    manager = {}
    # glob order is filesystem dependent; sort for a reproducible registry.
    for model_path in sorted(glob.glob(f"{local_path}/*")):
        base_name = os.path.basename(model_path)
        model_name = os.path.splitext(base_name)[0]
        tokenizer = BertTokenizer.from_pretrained(model_path)
        model = BertForSequenceClassification.from_pretrained(model_path)
        model = model.to(device)
        manager[model_name] = {
            "model": model,
            "tokenizer": tokenizer,
        }
    return manager


def main():
    """Score the first 20 sample rows with every model under ``./models``."""
    data = pd.read_csv("assets/Kickstarter_sentence_level_5000.csv")
    # Take an explicit copy: predict_csv adds columns in place, and writing to
    # a slice of the original frame is ambiguous in pandas.
    data = data.iloc[:20].copy()

    device = torch.device("cpu")
    manager = model_factory("./models", device)

    # to_csv does not create missing parent directories.
    os.makedirs("output", exist_ok=True)

    for model_name, dct in manager.items():
        model, tokenizer = dct["model"], dct["tokenizer"]
        fk_doc_result = predict_csv(data, "content", tokenizer, model, device)
        single_response = predict_single(
            "Games of the imagination teach us actions have consequences in a realm that can be reset.",
            tokenizer,
            model,
            device,
        )
        print(f"{model_name}: {dict(zip(LABEL_COLUMNS, single_response))}")
        fk_doc_result.to_csv(f"output/prediction_{model_name}.csv")


if __name__ == "__main__":
    main()