Spaces:
Sleeping
Sleeping
### install the needed package | |
# !pip install transformers | |
# !pip install torchmetrics | |
# !pip3 install ogb pytorch_lightning -q | |
import pandas as pd | |
from tqdm.auto import tqdm | |
import torch | |
import torch.nn as nn | |
from torch.utils.data import DataLoader, Dataset | |
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup | |
# import pytorch_lightning as pl | |
pd.set_option('display.max_columns', 500) | |
RANDOM_SEED = 42 | |
class ModelTagger(nn.Module): | |
def __init__(self, model_path="bert-base-uncased"): | |
super().__init__() | |
self.bert = BertModel.from_pretrained(model_path, return_dict=True) | |
self.classifier = nn.Linear(self.bert.config.hidden_size, 4) | |
self.criterion = nn.BCELoss() | |
def forward(self, input_ids, attention_mask, labels=None): | |
output = self.bert(input_ids, attention_mask=attention_mask) | |
output = self.classifier(output.pooler_output) | |
output = torch.sigmoid(output) | |
loss = 0 | |
if labels is not None: | |
loss = self.criterion(output, labels) | |
return loss, output | |
class Predict_Dataset(Dataset): | |
def __init__( | |
self, | |
data: pd.DataFrame, | |
text_col: str, | |
tokenizer: BertTokenizer, | |
max_token_len: int = 128 | |
): | |
self.text_col = text_col | |
self.tokenizer = tokenizer | |
self.data = data | |
self.max_token_len = max_token_len | |
def __len__(self): | |
return len(self.data) | |
def __getitem__(self, index: int): | |
data_row = self.data.iloc[index] | |
post = data_row[self.text_col] | |
encoding = self.tokenizer.encode_plus( | |
post, | |
add_special_tokens=True, | |
max_length=self.max_token_len, | |
return_token_type_ids=False, | |
padding="max_length", | |
truncation=True, | |
return_attention_mask=True, | |
return_tensors='pt', | |
) | |
return dict( | |
post=post, | |
input_ids=encoding["input_ids"].flatten(), | |
attention_mask=encoding["attention_mask"].flatten(), | |
) | |
def predict(data, text_col, tokenizer, model, device, LABEL_COLUMNS, max_token_len=128): | |
predictions = [] | |
df_token = Predict_Dataset(data, text_col, tokenizer, max_token_len=max_token_len) | |
loader = DataLoader(df_token, batch_size=1000, num_workers=0) | |
for item in tqdm(loader): | |
_, prediction = model( | |
item["input_ids"].to(device), | |
item["attention_mask"].to(device) | |
) | |
predictions.append(prediction.detach().cpu()) | |
final_pred = torch.cat(predictions, dim=0) | |
y_inten = final_pred.numpy().T | |
return { | |
LABEL_COLUMNS[0]: y_inten[0].tolist(), | |
LABEL_COLUMNS[1]: y_inten[1].tolist(), | |
LABEL_COLUMNS[2]: y_inten[2].tolist(), | |
LABEL_COLUMNS[3]: y_inten[3].tolist() | |
} | |
def get_result(df, result, LABEL_COLUMNS): | |
df[LABEL_COLUMNS[0]] = result[LABEL_COLUMNS[0]] | |
df[LABEL_COLUMNS[1]] = result[LABEL_COLUMNS[1]] | |
df[LABEL_COLUMNS[2]] = result[LABEL_COLUMNS[2]] | |
df[LABEL_COLUMNS[3]] = result[LABEL_COLUMNS[3]] | |
return df | |
Data = pd.read_csv("Kickstarter_sentence_level_5000.csv") | |
Data = Data[:20] | |
device = torch.device('cpu') | |
BERT_MODEL_NAME = 'bert-base-uncased' | |
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) | |
LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"] | |
params = torch.load("checkpoints/Kickstarter.ckpt", map_location='cpu')['state_dict'] | |
kick_model = ModelTagger() | |
kick_model.load_state_dict(params, strict=True) | |
kick_model.eval() | |
kick_model = kick_model.to(device) | |
kick_fk_doc_result = predict(Data,"content", tokenizer,kick_model, device, LABEL_COLUMNS) | |
fk_result = get_result(Data, kick_fk_doc_result, LABEL_COLUMNS) | |
fk_result.to_csv("output/prediction_origin_Kickstarter.csv") | |
# tab_output = gr.Label(label='Probability Predictions:', value=dict(zip(LABEL_COLUMNS, [0]*len(LABEL_COLUMNS)))) |