# NOTE(review): the original file began with non-Python scrape residue from a
# Hugging Face Space file listing ("Spaces", "Sleeping", "File size: 3,174
# Bytes", and a line-number gutter). It has been reduced to this comment so
# the module is valid Python; no program content was lost.
import pandas as pd
from tqdm.auto import tqdm
import torch
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
import os
import glob
# Reproducibility: seed torch's RNG. The original `pd.RANDOM_SEED = 42` was a
# bug — pandas has no such setting, so it only attached a dead attribute to
# the pandas module and seeded nothing.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

# Output heads of the fine-tuned tone classifier, in logit order; predict_csv
# writes one DataFrame column per entry.
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"]
@torch.no_grad()
def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
    """Score every row of *data* with the multi-label tone classifier.

    Texts from ``data[text_col]`` are tokenized in mini-batches, pushed through
    *model*, and the per-label sigmoid probabilities are written back onto
    *data* as one new column per entry of ``LABEL_COLUMNS`` (the DataFrame is
    mutated in place and also returned, matching the original contract).

    Args:
        data: pandas DataFrame holding the texts (mutated in place).
        text_col: name of the column containing the raw sentences.
        tokenizer: HF tokenizer callable returning ``input_ids``/``attention_mask``.
        model: sequence-classification model whose head size must equal
            ``len(LABEL_COLUMNS)``.
        device: torch device the model already lives on.
        text_bs: mini-batch size.
        max_token_len: fixed padding/truncation length.

    Returns:
        The same DataFrame with one probability column per label.
    """
    texts = data[text_col]
    n = len(texts)
    batch_scores = []
    # tqdm infers the total from the range itself; no explicit total needed.
    for start in tqdm(range(0, n, text_bs), desc="Processing..."):
        # .iloc makes the positional slice explicit; slicing past the end is
        # safe (it clamps), so no min() guard is required.
        batch = texts.iloc[start:start + text_bs].tolist()
        encoding = tokenizer(
            batch,
            add_special_tokens=True,
            max_length=max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        logits = model(
            encoding["input_ids"].to(device),
            encoding["attention_mask"].to(device),
            return_dict=True
        ).logits
        # Independent sigmoid per head: multi-label, not softmax.
        batch_scores.append(torch.sigmoid(logits).cpu())
    if not batch_scores:
        # Empty input: original code crashed in torch.cat; emit empty columns.
        for col in LABEL_COLUMNS:
            data[col] = []
        return data
    scores = torch.cat(batch_scores, dim=0).numpy()
    # Generalized over LABEL_COLUMNS instead of four hard-coded index lines,
    # so adding/removing a label needs no change here.
    for j, col in enumerate(LABEL_COLUMNS):
        data[col] = scores[:, j].tolist()
    return data
@torch.no_grad()
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Return the per-label sigmoid probabilities for one sentence.

    The sentence is tokenized to a fixed ``max_token_len`` window, scored by
    *model* on *device*, and the logits are squashed independently per head
    (multi-label). The result is a flat Python list of floats, one per label.
    """
    enc = tokenizer(
        sentence,
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    output = model(input_ids, attention_mask, return_dict=True)
    probabilities = torch.sigmoid(output.logits)
    return probabilities.flatten().cpu().numpy().T.tolist()
def model_factory(local_path, device):
    """Load every saved checkpoint found directly under *local_path*.

    Each entry matching ``local_path/*`` is loaded as a BERT
    sequence-classification checkpoint (tokenizer + model, model moved to
    *device*). The registry maps the checkpoint's basename (extension
    stripped) to ``{"model": ..., "tokenizer": ...}``.
    """
    registry = {}
    for checkpoint in glob.glob(f"{local_path}/*"):
        name, _ = os.path.splitext(os.path.basename(checkpoint))
        bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
        bert_model = BertForSequenceClassification.from_pretrained(checkpoint).to(device)
        registry[name] = {
            "model": bert_model,
            "tokenizer": bert_tokenizer,
        }
    return registry
if __name__ == "__main__":
    # Fix: ensure the destination directory exists before any to_csv call —
    # the original crashed with FileNotFoundError on a fresh checkout.
    os.makedirs("output", exist_ok=True)

    data = pd.read_csv("Kickstarter_sentence_level_5000.csv")
    data = data[:20]  # quick smoke run: score only the first 20 rows
    device = torch.device('cpu')

    manager = model_factory("./models", device)
    for model_name, bundle in manager.items():
        model, tokenizer = bundle['model'], bundle['tokenizer']
        # predict_csv mutates `data` in place; each model overwrites the
        # label columns before its snapshot is written out.
        scored = predict_csv(data, "content", tokenizer, model, device)
        # Kept for parity with the original script even though the result is
        # unused — it exercises the single-sentence path per model.
        single_response = predict_single(
            "Games of the imagination teach us actions have consequences in a realm that can be reset.",
            tokenizer, model, device,
        )
        scored.to_csv(f"output/prediction_{model_name}.csv")