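# NOTE: the lines below are file-viewer page residue captured with the source, not program code.
# murphy / Prediction.py
# cheesexuebao's picture
# Modify tables
# 74b913c
# raw
# history blame
# 3.17 kB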
import pandas as pd
from tqdm.auto import tqdm
import torch
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
import os
import glob
# Seed value kept as a module constant; nothing in the visible code passes it
# to a random-number generator.
RANDOM_SEED = 42
# NOTE(review): this only sets an ad-hoc attribute on the imported pandas
# module object; it presumably does not seed anything -- confirm intent.
pd.RANDOM_SEED = 42
# Names of the output columns written by predict_csv; entry k receives the
# model's softmax probability for class index k.
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone", "None"]
@torch.no_grad()
def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
    """Classify every text in ``data[text_col]`` and store the probabilities in ``data``.

    The texts are tokenized and passed through ``model`` in batches of
    ``text_bs``. The softmax over the logits is written back into ``data``,
    one column per entry of ``LABEL_COLUMNS`` (class index k goes to
    ``LABEL_COLUMNS[k]``), with every value rounded to 8 decimals.

    Args:
        data: pandas DataFrame holding the texts; it is modified in place.
        text_col: Name of the column in ``data`` that contains the texts.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        model: Sequence-classification model whose output exposes ``.logits``.
            It must emit at least ``len(LABEL_COLUMNS)`` classes; any extra
            classes are ignored.
        device: Device the encoded tensors are moved to before the forward pass.
        text_bs: Number of texts per forward pass.
        max_token_len: Every text is padded or truncated to this token length.

    Returns:
        The same ``data`` object, with the ``LABEL_COLUMNS`` columns added or
        overwritten.
    """
    texts = data[text_col]
    num_text = len(texts)

    if num_text == 0:
        # torch.cat() rejects an empty list, so skip inference entirely but
        # still add the output columns so callers get a consistent schema.
        for label in LABEL_COLUMNS:
            data[label] = pd.Series([], index=data.index, dtype="float64")
        return data

    batch_starts = range(0, num_text, text_bs)
    batch_probs = []
    for start in tqdm(batch_starts, total=len(batch_starts), desc="Processing..."):
        # .iloc makes the positional slicing explicit whatever the index
        # labels are; a slice running past the end is clamped automatically.
        batch = texts.iloc[start:start + text_bs].tolist()
        encoding = tokenizer(
            batch,
            add_special_tokens=True,
            max_length=max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        logits = model(
            encoding["input_ids"].to(device),
            encoding["attention_mask"].to(device),
            return_dict=True
        ).logits
        # Keep results on the CPU so they can be converted to NumPy below.
        batch_probs.append(torch.softmax(logits, dim=1).detach().cpu())

    # Transpose (rows, classes) -> (classes, rows) so each row of the array is
    # one output column.
    probs_by_class = torch.cat(batch_probs, dim=0).numpy().T
    for class_idx, label in enumerate(LABEL_COLUMNS):
        data[label] = [round(prob, 8) for prob in probs_by_class[class_idx].tolist()]
    return data
@torch.no_grad()
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Return the class probabilities for ``sentence`` as a flat list of floats.

    The input is tokenized (padded or truncated to ``max_token_len`` tokens),
    passed through ``model`` on ``device``, and the softmax of the logits is
    flattened and rounded to 8 decimals.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = tokenizer(sentence, **tokenizer_options)

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    output = model(input_ids, attention_mask, return_dict=True)

    probabilities = torch.softmax(output.logits, dim=1)
    flat_values = probabilities.flatten().cpu().numpy().tolist()
    return [round(value, 8) for value in flat_values]
def model_factory(local_path, device):
    """Load every checkpoint found directly under ``local_path``.

    Each entry matched by ``<local_path>/*`` is loaded as a BERT tokenizer and
    a sequence-classification model, and the model is moved to ``device``.

    Returns:
        A dict keyed by the entry's base name without its extension; each value
        is ``{"model": ..., "tokenizer": ...}``. If two entries reduce to the
        same key, the one visited later wins.
    """
    def _load(checkpoint_path):
        # Tokenizer first, then model, then the move to the target device.
        tokenizer = BertTokenizer.from_pretrained(checkpoint_path)
        model = BertForSequenceClassification.from_pretrained(checkpoint_path).to(device)
        return {"model": model, "tokenizer": tokenizer}

    return {
        os.path.splitext(os.path.basename(checkpoint_path))[0]: _load(checkpoint_path)
        for checkpoint_path in glob.glob(f"{local_path}/*")
    }
if __name__ == "__main__":
    # Smoke run: score the first 20 rows of the sample CSV with every model
    # found under ./models and write one prediction file per model.
    # .copy() gives predict_csv an independent frame to add columns to,
    # instead of a slice of the full table.
    data = pd.read_csv("assets/Kickstarter_sentence_level_5000.csv").iloc[:20].copy()
    device = torch.device('cpu')
    manager = model_factory("./models", device)

    # to_csv does not create missing parent directories; make sure the target
    # exists before spending time on inference.
    os.makedirs("output", exist_ok=True)

    for model_name, dct in manager.items():
        model, tokenizer = dct['model'], dct['tokenizer']
        # predict_csv writes into `data` in place, so each model overwrites the
        # label columns left by the previous one before its file is saved.
        fk_doc_result = predict_csv(data, "content", tokenizer, model, device)
        single_response = predict_single("Games of the imagination teach us actions have consequences in a realm that can be reset.", tokenizer, model, device)
        # Show the single-sentence result instead of silently discarding it.
        print(f"{model_name}: {single_response}")
        fk_doc_result.to_csv(f"output/prediction_{model_name}.csv")