Spaces:
Sleeping
Sleeping
import pandas as pd | |
from tqdm.auto import tqdm | |
import torch | |
from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification | |
import os | |
import glob | |
# Seed value for reproducibility.
# NOTE(review): nothing in this part of the file passes it to an RNG - confirm
# whether it is consumed elsewhere.
RANDOM_SEED = 42
# NOTE(review): this only attaches an attribute to the imported pandas module
# object; presumably pandas itself never reads it - confirm before relying on
# it for seeding.
pd.RANDOM_SEED = RANDOM_SEED
# Names of the output columns; predict_csv() writes the model's i-th score into
# the i-th name listed here.
LABEL_COLUMNS = [
    "Assertive Tone",
    "Conversational Tone",
    "Emotional Tone",
    "Informative Tone",
    "None",
]
def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
    """Score every text in ``data[text_col]`` and store one column per label.

    The texts are tokenized and sent through ``model`` in batches of
    ``text_bs``. The logits are converted to probabilities with a softmax
    over dim 1 and rounded to 8 decimals.

    Args:
        data: DataFrame holding the texts. It is modified in place: one
            column per entry of ``LABEL_COLUMNS`` is added or overwritten.
        text_col: Name of the column in ``data`` that contains the texts.
        tokenizer: Callable invoked with a list of strings plus the keyword
            arguments below; its result must provide ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        model: Callable invoked as ``model(input_ids, attention_mask,
            return_dict=True)`` whose result exposes ``.logits``.
        device: Device the input tensors are moved to before the forward pass.
        text_bs: Number of texts per forward pass.
        max_token_len: Every text is padded/truncated to this many tokens.

    Returns:
        The same ``data`` object, with the label columns filled in.

    Raises:
        IndexError: If the model emits fewer scores per text than there are
            entries in ``LABEL_COLUMNS``.
    """
    texts = data[text_col]
    num_text = len(texts)

    if num_text == 0:
        # torch.cat() rejects an empty list, so an empty frame simply gets
        # empty label columns instead of crashing.
        for label in LABEL_COLUMNS:
            data[label] = []
        return data

    batch_starts = range(0, num_text, text_bs)
    batch_probs = []
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for start in tqdm(batch_starts, total=len(batch_starts), desc="Processing..."):
            # .iloc keeps the slice positional whatever the frame's index is;
            # slicing past the end is safe, so no explicit clamp is needed.
            batch = texts.iloc[start:start + text_bs].tolist()
            encoding = tokenizer(
                batch,
                add_special_tokens=True,
                max_length=max_token_len,
                return_token_type_ids=False,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            logits = model(
                encoding["input_ids"].to(device),
                encoding["attention_mask"].to(device),
                return_dict=True,
            ).logits
            batch_probs.append(torch.softmax(logits, dim=1).detach().cpu())

    # Concatenate to (num_texts, num_scores), then transpose so that row i
    # holds the i-th score of every text.
    per_label = torch.cat(batch_probs, dim=0).numpy().T
    for idx, label in enumerate(LABEL_COLUMNS):
        data[label] = [round(score, 8) for score in per_label[idx].tolist()]
    return data
def predict_single(sentence, tokenizer, model, device, max_token_len=128):
    """Return the softmax scores the model assigns to one sentence.

    Args:
        sentence: Text to classify.
        tokenizer: Callable invoked with ``sentence`` plus the keyword
            arguments below; its result must provide ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        model: Callable invoked as ``model(input_ids, attention_mask,
            return_dict=True)`` whose result exposes ``.logits``.
        device: Device the input tensors are moved to before the forward pass.
        max_token_len: The text is padded/truncated to this many tokens.

    Returns:
        A flat list of Python floats: the softmax of the logits over dim 1,
        flattened and rounded to 8 decimals.
    """
    encoding = tokenizer(
        sentence,
        add_special_tokens=True,
        max_length=max_token_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Inference only. Without no_grad/detach the result stays attached to the
    # autograd graph and converting it to NumPy raises a RuntimeError.
    with torch.no_grad():
        logits = model(
            encoding["input_ids"].to(device),
            encoding["attention_mask"].to(device),
            return_dict=True,
        ).logits
    probabilities = torch.softmax(logits, dim=1).detach()
    return [round(score, 8) for score in probabilities.flatten().cpu().tolist()]
def model_factory(local_path, device):
    """Load every model directory found directly under ``local_path``.

    Args:
        local_path: Directory whose immediate sub-directories are each passed
            to ``from_pretrained`` for both the tokenizer and the model.
        device: Device each loaded model is moved to.

    Returns:
        A dict mapping a model name to ``{"model": ..., "tokenizer": ...}``.
        The name is the entry's base name with anything after its final dot
        removed. Entries are visited in sorted path order. The dict is empty
        when ``local_path`` is missing or contains no sub-directories.
    """
    manager = {}
    # sorted(): glob's order depends on the filesystem, which would make the
    # order of the returned dict differ between machines.
    for model_path in sorted(glob.glob(os.path.join(local_path, "*"))):
        # Stray files (README, archives, ...) next to the model directories
        # are skipped rather than handed to from_pretrained().
        if not os.path.isdir(model_path):
            continue
        model_name = os.path.splitext(os.path.basename(model_path))[0]
        tokenizer = BertTokenizer.from_pretrained(model_path)
        model = BertForSequenceClassification.from_pretrained(model_path).to(device)
        manager[model_name] = {
            "model": model,
            "tokenizer": tokenizer,
        }
    return manager
if __name__ == "__main__":
    # Demo run: score the first 20 rows of the sample CSV with every model
    # found under ./models and write one prediction file per model.
    Data = pd.read_csv("assets/Kickstarter_sentence_level_5000.csv")
    # predict_csv() adds columns in place, so work on an independent copy
    # rather than on a slice of the loaded frame.
    Data = Data[:20].copy()
    device = torch.device('cpu')
    manager = model_factory("./models", device)
    # to_csv() does not create missing directories.
    os.makedirs("output", exist_ok=True)
    for model_name, dct in manager.items():
        model, tokenizer = dct['model'], dct['tokenizer']
        fk_doc_result = predict_csv(Data, "content", tokenizer, model, device)
        single_response = predict_single(
            "Games of the imagination teach us actions have consequences in a realm that can be reset.",
            tokenizer,
            model,
            device,
        )
        # Show the single-sentence scores instead of discarding them.
        print(f"{model_name}: {single_response}")
        fk_doc_result.to_csv(f"output/prediction_{model_name}.csv")