import os
import sys
import pickle
import random
import copy

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm, trange
from collections import OrderedDict
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

from utils import Config, Logger, make_log_dir
from modeling import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification_SPV,
    AutoModelForSequenceClassification_MIP,
    AutoModelForSequenceClassification_SPV_MIP,
)
from run_classifier_dataset_utils import processors, output_modes, compute_metrics
from data_loader import load_train_data, load_train_data_kf, load_test_data

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"
ARGS_NAME = "training_args.bin"


def main():
    # read configs
    config = Config(main_conf_path="./")

    # apply system arguments if they exist (expected format: --arg_name arg_value ...)
    argv = sys.argv[1:]
    if len(argv) > 0:
        cmd_arg = OrderedDict()
        argvs = " ".join(sys.argv[1:]).split(" ")
        for i in range(0, len(argvs), 2):
            arg_name, arg_value = argvs[i], argvs[i + 1]
            arg_name = arg_name.strip("-")
            cmd_arg[arg_name] = arg_value
        config.update_params(cmd_arg)

    args = config
    print(args.__dict__)

    # logger
    if "saves" in args.bert_model:
        # resuming from a saved checkpoint: restore its config, but keep the
        # run-specific arguments from the current invocation
        log_dir = args.bert_model
        logger = Logger(log_dir)
        config = Config(main_conf_path=log_dir)
        old_args = copy.deepcopy(args)
        args.__dict__.update(config.__dict__)

        args.bert_model = old_args.bert_model
        args.do_train = old_args.do_train
        args.data_dir = old_args.data_dir
        args.task_name = old_args.task_name

        # re-apply system arguments if they exist
        argv = sys.argv[1:]
        if len(argv) > 0:
            cmd_arg = OrderedDict()
            argvs = " ".join(sys.argv[1:]).split(" ")
            for i in range(0, len(argvs), 2):
                arg_name, arg_value = argvs[i], argvs[i + 1]
                arg_name = arg_name.strip("-")
                cmd_arg[arg_name] = arg_value
            # update `args` (not the stale `config` object rebound above),
            # so that command-line overrides actually take effect on resume
            args.update_params(cmd_arg)
    else:
        if not os.path.exists("saves"):
            os.mkdir("saves")
        log_dir = make_log_dir(os.path.join("saves", args.bert_model))
        logger = Logger(log_dir)
        config.save(log_dir)
    args.log_dir = log_dir

    # set CUDA devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    logger.info("device: {} n_gpu: {}".format(device, args.n_gpu))

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # get dataset and processor
    task_name = args.task_name.lower()
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    args.num_labels = len(label_list)

    # build tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = load_pretrained_model(args)
    ########### Training ###########
    # VUA18 / VUA20 for bagging
    if args.do_train and args.task_name == "vua" and args.num_bagging:
        train_data, gkf = load_train_data_kf(
            args, logger, processor, task_name, label_list, tokenizer, output_mode
        )
        for fold, (train_idx, valid_idx) in enumerate(tqdm(gkf, desc="bagging...")):
            if fold != args.bagging_index:
                continue
            print(f"bagging_index = {args.bagging_index}")

            # Load data
            temp_train_data = TensorDataset(*train_data[train_idx])
            train_sampler = RandomSampler(temp_train_data)
            train_dataloader = DataLoader(
                temp_train_data, sampler=train_sampler, batch_size=args.train_batch_size
            )

            # Reset model
            model = load_pretrained_model(args)
            model, best_result = run_train(
                args,
                logger,
                model,
                train_dataloader,
                processor,
                task_name,
                label_list,
                tokenizer,
                output_mode,
            )

            # Test
            all_guids, eval_dataloader = load_test_data(
                args, logger, processor, task_name, label_list, tokenizer, output_mode
            )
            preds = run_eval(
                args, logger, model, eval_dataloader, all_guids, task_name, return_preds=True
            )
            with open(os.path.join(args.data_dir, f"seed{args.seed}_preds_{fold}.p"), "wb") as f:
                pickle.dump(preds, f)

            # If the train data is VUA20, the model needs to be tested on VUAverb as well.
            # Adjust the data_dir names in the conditions below for your own data directories.
            if "VUA20" in args.data_dir:
                # Verb
                args.data_dir = "data/VUAverb"
                all_guids, eval_dataloader = load_test_data(
                    args, logger, processor, task_name, label_list, tokenizer, output_mode
                )
                preds = run_eval(
                    args, logger, model, eval_dataloader, all_guids, task_name, return_preds=True
                )
                with open(
                    os.path.join(args.data_dir, f"seed{args.seed}_preds_{fold}.p"), "wb"
                ) as f:
                    pickle.dump(preds, f)

        logger.info(f"Saved to {logger.log_dir}")
        return

    # VUA18 / VUA20
    if args.do_train and args.task_name == "vua":
        train_dataloader = load_train_data(
            args, logger, processor, task_name, label_list, tokenizer, output_mode
        )
        model, best_result = run_train(
            args,
            logger,
            model,
            train_dataloader,
            processor,
            task_name,
            label_list,
            tokenizer,
            output_mode,
        )
    # TroFi / MOH-X (K-fold)
    elif args.do_train and args.task_name == "trofi":
        k_result = []
        for k in tqdm(range(args.kfold), desc="K-fold"):
            model = load_pretrained_model(args)
            train_dataloader = load_train_data(
                args, logger, processor, task_name, label_list, tokenizer, output_mode, k
            )
            model, best_result = run_train(
                args,
                logger,
                model,
                train_dataloader,
                processor,
                task_name,
                label_list,
                tokenizer,
                output_mode,
                k,
            )
            k_result.append(best_result)

        # Calculate the average result over all folds
        avg_result = copy.deepcopy(k_result[0])
        for result in k_result[1:]:
            for key, value in result.items():
                avg_result[key] += value
        for key in avg_result:
            avg_result[key] /= len(k_result)

        logger.info("-----Average Result-----")
        for key in sorted(avg_result.keys()):
            logger.info(f"  {key} = {str(avg_result[key])}")

    # Load trained model
    if "saves" in args.bert_model:
        model = load_trained_model(args, model, tokenizer)

    ########### Inference ###########
    # VUA18 / VUA20
    if (args.do_eval or args.do_test) and task_name == "vua":
        # if the test data is genre or POS tag data
        if ("genre" in args.data_dir) or ("pos" in args.data_dir):
            if "genre" in args.data_dir:
                targets = ["acad", "conv", "fict", "news"]
            elif "pos" in args.data_dir:
                targets = ["adj", "adv", "noun", "verb"]
            orig_data_dir = args.data_dir
            for idx, target in tqdm(enumerate(targets)):
                logger.info(f"====================== Evaluating {target} =====================")
                args.data_dir = os.path.join(orig_data_dir, target)
                all_guids, eval_dataloader = load_test_data(
                    args, logger, processor, task_name, label_list, tokenizer, output_mode
                )
                run_eval(args, logger, model, eval_dataloader, all_guids, task_name)
        else:
            all_guids, eval_dataloader = load_test_data(
                args, logger, processor, task_name, label_list, tokenizer, output_mode
            )
            run_eval(args, logger, model, eval_dataloader, all_guids, task_name)
    # TroFi / MOH-X (K-fold)
    elif (args.do_eval or args.do_test) and args.task_name == "trofi":
        logger.info(f"***** Evaluating with {args.data_dir}")
        k_result = []
        # evaluate with the same number of folds used for training
        for k in tqdm(range(args.kfold), desc="K-fold"):
            all_guids, eval_dataloader = load_test_data(
                args, logger, processor, task_name, label_list, tokenizer, output_mode, k
            )
            result = run_eval(args, logger, model, eval_dataloader, all_guids, task_name)
            k_result.append(result)

        # Calculate the average result over all folds
        avg_result = copy.deepcopy(k_result[0])
        for result in k_result[1:]:
            for key, value in result.items():
                avg_result[key] += value
        for key in avg_result:
            avg_result[key] /= len(k_result)

        logger.info("-----Average Result-----")
        for key in sorted(avg_result.keys()):
            logger.info(f"  {key} = {str(avg_result[key])}")

    logger.info(f"Saved to {logger.log_dir}")


def run_train(
    args,
    logger,
    model,
    train_dataloader,
    processor,
    task_name,
    label_list,
    tokenizer,
    output_mode,
    k=None,
):
    num_train_optimization_steps = len(train_dataloader) * args.num_train_epoch

    # Prepare optimizer and scheduler: no weight decay for biases and LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    # the original `!= False or ... != "none"` test was always true (and could
    # crash on a boolean); the scheduler should only be built when lr_schedule
    # is set and is not "none"
    use_lr_schedule = args.lr_schedule and str(args.lr_schedule).lower() != "none"
    if use_lr_schedule:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_epoch * len(train_dataloader)),
            num_training_steps=num_train_optimization_steps,
        )

    logger.info("***** Running training *****")
    logger.info(f"  Batch size = {args.train_batch_size}")
    logger.info(f"  Num steps = {num_train_optimization_steps}")

    # Run training
    model.train()
    max_val_f1 = -1
    max_result = {}
    for epoch in trange(int(args.num_train_epoch), desc="Epoch"):
        tr_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            # move batch data to gpu
            batch = tuple(t.to(args.device) for t in batch)
            if args.model_type in ["MELBERT_MIP", "MELBERT"]:
                (
                    input_ids,
                    input_mask,
                    segment_ids,
                    label_ids,
                    input_ids_2,
                    input_mask_2,
                    segment_ids_2,
                ) = batch
            else:
                input_ids, input_mask, segment_ids, label_ids = batch

            # compute loss values
            if args.model_type in ["BERT_SEQ", "BERT_BASE", "MELBERT_SPV"]:
                logits = model(
                    input_ids,
                    target_mask=(segment_ids == 1),
                    token_type_ids=segment_ids,
                    attention_mask=input_mask,
                )
                loss_fct = nn.NLLLoss(weight=torch.Tensor([1, args.class_weight]).to(args.device))
                loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
            elif args.model_type in ["MELBERT_MIP", "MELBERT"]:
                logits = model(
                    input_ids,
                    input_ids_2,
                    target_mask=(segment_ids == 1),
                    target_mask_2=segment_ids_2,
                    attention_mask_2=input_mask_2,
                    token_type_ids=segment_ids,
                    attention_mask=input_mask,
                )
                loss_fct = nn.NLLLoss(weight=torch.Tensor([1, args.class_weight]).to(args.device))
                loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
            # average the loss if on multi-gpu
            if args.n_gpu > 1:
                loss = loss.mean()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if use_lr_schedule:
                scheduler.step()
            optimizer.zero_grad()

            tr_loss += loss.item()

        cur_lr = optimizer.param_groups[0]["lr"]
        logger.info(f"[epoch {epoch + 1}] lr: {cur_lr}, tr_loss: {tr_loss}")

        # evaluate after each epoch
        if args.do_eval:
            all_guids, eval_dataloader = load_test_data(
                args, logger, processor, task_name, label_list, tokenizer, output_mode, k
            )
            result = run_eval(args, logger, model, eval_dataloader, all_guids, task_name)

            # update the best result; for TroFi / MOH-X, checkpoint whenever
            # the validation F1 improves
            if result["f1"] > max_val_f1:
                max_val_f1 = result["f1"]
                max_result = result
                if args.task_name == "trofi":
                    save_model(args, model, tokenizer)

    # for VUA, save the model once training is finished
    if args.task_name == "vua":
        save_model(args, model, tokenizer)

    logger.info("-----Best Result-----")
    for key in sorted(max_result.keys()):
        logger.info(f"  {key} = {str(max_result[key])}")

    return model, max_result


def run_eval(args, logger, model, eval_dataloader, all_guids, task_name, return_preds=False):
    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    preds = []
    pred_guids = []
    out_label_ids = None

    for eval_batch in tqdm(eval_dataloader, desc="Evaluating"):
        eval_batch = tuple(t.to(args.device) for t in eval_batch)

        if args.model_type in ["MELBERT_MIP", "MELBERT"]:
            (
                input_ids,
                input_mask,
                segment_ids,
                label_ids,
                idx,
                input_ids_2,
                input_mask_2,
                segment_ids_2,
            ) = eval_batch
        else:
            input_ids, input_mask, segment_ids, label_ids, idx = eval_batch

        with torch.no_grad():
            # compute logits (the loss/accumulation code below is shared by
            # both branches instead of being duplicated)
            if args.model_type in ["BERT_BASE", "BERT_SEQ", "MELBERT_SPV"]:
                logits = model(
                    input_ids,
                    target_mask=(segment_ids == 1),
                    token_type_ids=segment_ids,
                    attention_mask=input_mask,
                )
            elif args.model_type in ["MELBERT_MIP", "MELBERT"]:
                logits = model(
                    input_ids,
                    input_ids_2,
                    target_mask=(segment_ids == 1),
                    target_mask_2=segment_ids_2,
                    attention_mask_2=input_mask_2,
                    token_type_ids=segment_ids,
                    attention_mask=input_mask,
                )

            loss_fct = nn.NLLLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            # accumulate predictions, guids, and gold labels across batches
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                pred_guids.append([all_guids[i] for i in idx])
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                pred_guids[0].extend([all_guids[i] for i in idx])
                out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = preds[0]
    preds = np.argmax(preds, axis=1)

    # compute metrics
    result = compute_metrics(preds, out_label_ids)
    for key in sorted(result.keys()):
        logger.info(f"  {key} = {str(result[key])}")

    if return_preds:
        return preds
    return result
def load_pretrained_model(args):
    # Pretrained encoder
    bert = AutoModel.from_pretrained(args.bert_model)
    # diagnostic: print which encoder parameters are trainable
    for name, param in bert.named_parameters():
        print(name, param.requires_grad)
    config = bert.config
    # the model uses four token types, so the token type embedding table is
    # rebuilt with the matching size and re-initialized
    config.type_vocab_size = 4
    if "albert" in args.bert_model:
        bert.embeddings.token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.embedding_size
        )
    else:
        bert.embeddings.token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.hidden_size
        )
    bert._init_weights(bert.embeddings.token_type_embeddings)

    # Additional layers on top of the encoder, depending on the model type
    if args.model_type == "BERT_BASE":
        model = AutoModelForSequenceClassification(
            args=args, Model=bert, config=config, num_labels=args.num_labels
        )
    elif args.model_type == "BERT_SEQ":
        model = AutoModelForTokenClassification(
            args=args, Model=bert, config=config, num_labels=args.num_labels
        )
    elif args.model_type == "MELBERT_SPV":
        model = AutoModelForSequenceClassification_SPV(
            args=args, Model=bert, config=config, num_labels=args.num_labels
        )
    elif args.model_type == "MELBERT_MIP":
        model = AutoModelForSequenceClassification_MIP(
            args=args, Model=bert, config=config, num_labels=args.num_labels
        )
    elif args.model_type == "MELBERT":
        model = AutoModelForSequenceClassification_SPV_MIP(
            args=args, Model=bert, config=config, num_labels=args.num_labels
        )

    model.to(args.device)
    if args.n_gpu > 1 and not args.no_cuda:
        model = torch.nn.DataParallel(model)
    return model


def save_model(args, model, tokenizer):
    # Only save the model itself, not the DataParallel wrapper
    model_to_save = model.module if hasattr(model, "module") else model

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(args.log_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.log_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.log_dir)

    # Good practice: save the training arguments together with the trained model
    output_args_file = os.path.join(args.log_dir, ARGS_NAME)
    torch.save(args, output_args_file)


def load_trained_model(args, model, tokenizer):
    # If we saved using the predefined names, we can load with the same names
    output_model_file = os.path.join(args.log_dir, WEIGHTS_NAME)

    if hasattr(model, "module"):
        model.module.load_state_dict(torch.load(output_model_file, map_location=args.device))
    else:
        model.load_state_dict(torch.load(output_model_file, map_location=args.device))

    return model


if __name__ == "__main__":
    main()
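# A minimal sketch of how this script might be invoked, assuming defaults come
# from the config file read from the working directory and are overridden via
# "--name value" pairs, which is what the parsing loop at the top of main()
# expects. The model name and data path below are illustrative assumptions,
# not shipped defaults:
#
#   python main.py --bert_model roberta-base \
#                  --task_name vua \
#                  --data_dir data/VUA20 \
#                  --model_type MELBERT \
#                  --do_train True
#
# To resume from a checkpoint, pass the save directory as --bert_model;
# main() treats any path containing "saves" as a trained-model directory.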