Spaces:

cheesexuebao
/

murphy

Sleeping

App Files Files Community

murphy / Prediction.py

cheesexuebao

Modify tables

74b913c 11 months ago

raw

history blame

3.17 kB

	import pandas as pd
	from tqdm.auto import tqdm
	import torch
	from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
	import os
	import glob


	# Seed constant; nothing in the visible code passes it to a random number generator.
	RANDOM_SEED = 42
	# NOTE(review): this only attaches an attribute to the imported pandas module;
	# presumably intended as seeding -- confirm whether anything reads it.
	pd.RANDOM_SEED = RANDOM_SEED
	# Output column names, in the order of the model's class scores (index i of the
	# softmax output is written to LABEL_COLUMNS[i]).
	LABEL_COLUMNS = [
	    "Assertive Tone",
	    "Conversational Tone",
	    "Emotional Tone",
	    "Informative Tone",
	    "None",
	]


	@torch.no_grad()
	def predict_csv(data, text_col, tokenizer, model, device, text_bs=16, max_token_len=128):
	    """Score every text in ``data[text_col]`` and write the class probabilities into ``data``.

	    The texts are tokenized and run through ``model`` in batches of ``text_bs``.
	    The logits are turned into probabilities with a softmax over dim 1, and
	    score ``i`` of each row is stored, rounded to 8 decimals, in the column
	    named ``LABEL_COLUMNS[i]``.

	    Args:
	        data: pandas DataFrame holding the texts. It is modified in place: the
	            columns named in ``LABEL_COLUMNS`` are added or overwritten.
	        text_col: Name of the column in ``data`` that contains the texts.
	        tokenizer: Callable tokenizer; called with a list of texts and expected
	            to return a mapping with ``"input_ids"`` and ``"attention_mask"``
	            tensors.
	        model: Callable model whose output exposes ``.logits``.
	            Assumes logits are shaped (batch, num_labels) with at least
	            ``len(LABEL_COLUMNS)`` labels -- TODO confirm against the checkpoints.
	        device: Device the encoded tensors are moved to before the forward pass.
	        text_bs: Number of texts per batch; must be at least 1.
	        max_token_len: Length every text is padded or truncated to.

	    Returns:
	        The same ``data`` object, with the probability columns filled in.

	    Raises:
	        ValueError: If ``text_bs`` is smaller than 1.
	    """
	    if text_bs < 1:
	        raise ValueError(f"text_bs must be at least 1, got {text_bs}")

	    post = data[text_col]
	    num_text = len(post)

	    if num_text == 0:
	        # torch.cat() rejects an empty list of tensors, so an empty input is
	        # answered directly with empty float columns instead of crashing.
	        for label in LABEL_COLUMNS:
	            data[label] = pd.Series([], index=data.index, dtype="float64")
	        return data

	    predictions = []
	    batch_starts = range(0, num_text, text_bs)
	    for start in tqdm(batch_starts, total=len(batch_starts), desc="Processing..."):
	        # .iloc slices by position whatever the index holds and clamps at the end.
	        texts = post.iloc[start:start + text_bs].tolist()
	        encoding = tokenizer(
	            texts,
	            add_special_tokens=True,
	            max_length=max_token_len,
	            return_token_type_ids=False,
	            padding="max_length",
	            truncation=True,
	            return_attention_mask=True,
	            return_tensors='pt',
	        )
	        logits = model(
	            encoding["input_ids"].to(device),
	            encoding["attention_mask"].to(device),
	            return_dict=True
	        ).logits
	        prediction = torch.softmax(logits, dim=1)
	        # Collect on the CPU so the final concatenation can be handed to NumPy.
	        predictions.append(prediction.detach().cpu())

	    # Rows are texts, columns are classes.
	    probabilities = torch.cat(predictions, dim=0).numpy()

	    for column, label in enumerate(LABEL_COLUMNS):
	        data[label] = [round(score, 8) for score in probabilities[:, column].tolist()]
	    return data

	@torch.no_grad()
	def predict_single(sentence, tokenizer, model, device, max_token_len=128):
	    """Return the softmax scores the model assigns to ``sentence``.

	    Args:
	        sentence: Text handed to ``tokenizer`` as its first argument.
	        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
	            and ``"attention_mask"`` tensors.
	        model: Callable model whose output exposes ``.logits``.
	        device: Device the encoded tensors are moved to before the forward pass.
	        max_token_len: Length the text is padded or truncated to.

	    Returns:
	        A flat list of floats: the softmax (over dim 1) of the logits,
	        flattened and rounded to 8 decimals.
	    """
	    tokenizer_options = dict(
	        add_special_tokens=True,
	        max_length=max_token_len,
	        return_token_type_ids=False,
	        padding="max_length",
	        truncation=True,
	        return_attention_mask=True,
	        return_tensors='pt',
	    )
	    encoded = tokenizer(sentence, **tokenizer_options)

	    input_ids = encoded["input_ids"].to(device)
	    attention_mask = encoded["attention_mask"].to(device)
	    output = model(input_ids, attention_mask, return_dict=True)

	    # Flattening yields a 1-D array, so no transpose is needed before listing it.
	    scores = torch.softmax(output.logits, dim=1).flatten().cpu().numpy()
	    return [round(score, 8) for score in scores.tolist()]

	def model_factory(local_path, device):
	    """Load a tokenizer and a classifier for every entry directly under ``local_path``.

	    Args:
	        local_path: Directory whose entries (matched with ``<local_path>/*``)
	            are each passed to ``from_pretrained``.
	        device: Device each loaded model is moved to.

	    Returns:
	        A dict keyed by the entry's base name without its extension; each value
	        is a dict with the keys ``"model"`` and ``"tokenizer"``. Empty when
	        nothing matches.
	    """
	    registry = {}
	    for checkpoint in glob.glob(f"{local_path}/*"):
	        # The key is the file or directory name with any extension stripped.
	        name, _extension = os.path.splitext(os.path.basename(checkpoint))
	        tokenizer = BertTokenizer.from_pretrained(checkpoint)
	        classifier = BertForSequenceClassification.from_pretrained(checkpoint).to(device)
	        registry[name] = {"model": classifier, "tokenizer": tokenizer}
	    return registry


	if __name__ == "__main__":
	    # Demo run: score the first 20 rows of the sample CSV with every model found
	    # under ./models and write one prediction file per model.

	    # predict_csv adds columns to the frame it receives; .copy() gives it an
	    # independent frame, since writing to a bare slice can raise pandas'
	    # SettingWithCopyWarning.
	    Data = pd.read_csv("assets/Kickstarter_sentence_level_5000.csv").iloc[:20].copy()
	    device = torch.device('cpu')

	    # to_csv does not create missing directories.
	    os.makedirs("output", exist_ok=True)

	    manager = model_factory("./models", device)
	    for model_name, dct in manager.items():
	        model, tokenizer = dct['model'], dct['tokenizer']
	        # predict_csv overwrites the label columns of Data on every iteration, so
	        # the frame must be saved before the next model runs.
	        fk_doc_result = predict_csv(Data, "content", tokenizer, model, device)
	        single_response = predict_single("Games of the imagination teach us actions have consequences in a realm that can be reset.", tokenizer, model, device)
	        # Show the single-sentence scores instead of silently discarding them.
	        print(f"{model_name}: {single_response}")
	        fk_doc_result.to_csv(f"output/prediction_{model_name}.csv")