diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..c3875fbc435c83968ce93faeba9a70338d202e3f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.log filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2812bcc7000ec0198a56d5c057701c81720e5d06
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*__pycache__
diff --git a/README.md b/README.md
index 7be5fc7f47d5db027d120b8024982df93db95b74..96a3416a020a0c2cbfee440d7066b2bcb4bafe94 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,47 @@
----
-license: mit
----
+---
+license: mit
+datasets:
+- axgroup/Ranking_TVR
+language:
+- en
+---
+# CONQUER_RVMR
+
+This repository contains the CONQUER baseline model for the Ranked Video Moment Retrieval (RVMR) task. The associated paper is titled "Video Moment Retrieval in Practical Setting: A Dataset of Ranked Moments for Imprecise Queries."
+
+The main repository of the paper is [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking), and this model is adapted from [CONQUER](https://github.com/houzhijian/CONQUER.git). The environment setup is the same as for RelocNet_RVMR, as detailed in the [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking) repository.
+
+
+CONQUER leverages video retrieval results from [HERO](https://github.com/linjieli222/HERO.git). We continue to use these
+results when training on our TVR-Ranking dataset. Note that because the HERO results are obtained from the TVR dataset, there could be a data leakage issue in our task setting. However, this issue is negligible for two reasons: (i) the queries used in our setting are imprecise, rewritten queries, and (ii) each query has multiple ground-truth moments in our task setting, which were not annotated in the original TVR dataset.
+
+
+## Performance
+
+
+| **Model** | **Train Set Top N** | **IoU=0.3** | | **IoU=0.5** | | **IoU=0.7** | |
+|------------|---------------------|-------------|----------|-------------|----------|-------------|----------|
+| | | **Val** | **Test** | **Val** | **Test** | **Val** | **Test** |
+| **NDCG@10**| | | | | | | |
+| CONQUER | 1 | 0.0999 | 0.0859 | 0.0844 | 0.0709 | 0.0530 | 0.0512 |
+| CONQUER | 20 | 0.2406 | 0.2249 | 0.2222 | 0.2104 | 0.1672 | 0.1517 |
+| CONQUER | 40 | 0.2450 | 0.2219 | 0.2262 | 0.2085 | 0.1670 | 0.1515 |
+| **NDCG@20**| | | | | | | |
+| CONQUER | 1 | 0.0952 | 0.0835 | 0.0808 | 0.0687 | 0.0526 | 0.0484 |
+| CONQUER | 20 | 0.2130 | 0.1995 | 0.1976 | 0.1867 | 0.1527 | 0.1368 |
+| CONQUER | 40 | 0.2183 | 0.1968 | 0.2022 | 0.1851 | 0.1524 | 0.1365 |
+| **NDCG@40**| | | | | | | |
+| CONQUER | 1 | 0.0974 | 0.0866 | 0.0832 | 0.0718 | 0.0557 | 0.0510 |
+| CONQUER | 20 | 0.2029 | 0.1906 | 0.1891 | 0.1788 | 0.1476 | 0.1326 |
+| CONQUER | 40 | 0.2080 | 0.1885 | 0.1934 | 0.1775 | 0.1473 | 0.1323 |
+
+
+## Quick Start
+
+Modify the path in `run_disjoint_top20.sh` and then execute the script:
+
+```sh
+sh run_disjoint_top20.sh
+```
+
+Feel free to contribute or raise issues for any problems encountered.
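The IoU thresholds in the performance table above are temporal intersection-over-union values between a predicted moment and an annotated moment. As a minimal illustrative sketch (not part of this repository; the video name and timestamps below are made-up placeholders), this is how one ranked prediction in the format emitted by `inference.py` (keys `video_name`, `timestamp`, `model_scores`) would be matched against a ground-truth moment at IoU >= 0.5:

```python
def temporal_iou(pred_span, gt_span):
    """Intersection-over-union of two [start, end] spans given in seconds."""
    inter = max(0.0, min(pred_span[1], gt_span[1]) - max(pred_span[0], gt_span[0]))
    union = max(pred_span[1], gt_span[1]) - min(pred_span[0], gt_span[0])
    return inter / union if union > 0 else 0.0

# One ranked prediction (same keys as the inference.py output) and one
# hypothetical annotated moment; names and times are placeholders.
pred = {"video_name": "show_clip_0001", "timestamp": [3.0, 9.0], "model_scores": 0.81}
gt = {"video_name": "show_clip_0001", "timestamp": [4.5, 9.0]}

hit_at_iou_05 = (pred["video_name"] == gt["video_name"]
                 and temporal_iou(pred["timestamp"], gt["timestamp"]) >= 0.5)
print(hit_at_iou_05)  # True: IoU = 4.5 / 6.0 = 0.75
```

The `calculate_ndcg_iou` helper imported in `inference.py` then aggregates such IoU-thresholded matches over the top-K ranked moments into the NDCG@K numbers reported above.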
\ No newline at end of file diff --git a/config/config.py b/config/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a407a816d85d1fb34422dd61a3e8d548a9541ac9 --- /dev/null +++ b/config/config.py @@ -0,0 +1,227 @@ +import os +import time +import torch +import argparse +import sys +import pprint + +import json +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile + + +def parse_with_config(parser): + args = parser.parse_args() + if args.config is not None: + config_args = json.load(open(args.config)) + override_keys = {arg[2:].split('=')[0] for arg in sys.argv[1:] + if arg.startswith('--')} + for k, v in config_args.items(): + if k not in override_keys: + setattr(args, k, v) + del args.config + return args + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, default="tvr", choices=["tvr", "didemo"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in video_duration_idx_path, must set for VCMR") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--disable_eval", action="store_true", + help="disable eval") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=50, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=3, + help="number of epochs to early stop, use -1 to disable early stop") + self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", + default=["VCMR", "SVMR", "VR"], choices=["VCMR", "SVMR", "VR"], + help="evaluate and report numbers for tasks specified here.") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=8, + help="mini-batch size at inference, for query") + self.parser.add_argument("--no_eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--eval_epoch_num", type=int, default=1, help="eval_epoch_num") + + # Data config + self.parser.add_argument("--max_ctx_len", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, only 109/21825 > 100") + self.parser.add_argument("--max_desc_len", type=int, default=30, help="max number of query token") + self.parser.add_argument("--clip_length", type=float, default=1.5, + help="each video will be uniformly segmented into small clips") + self.parser.add_argument("--ctx_mode", type=str, default="visual_sub", + help="adopted modality list for each clip") + self.parser.add_argument("--dataset_config", type=str,help="data config") + + + # Model config + + self.parser.add_argument("--visual_dim", type=int,default=4352,help="visual modality feature dimension") + self.parser.add_argument("--text_dim", type=int, default=768, help="textual modality feature dimension") + self.parser.add_argument("--query_dim", type=int, default=768, help="query feature dimension") + self.parser.add_argument("--hidden_dim", type=int, default=768, help="joint dimension") + self.parser.add_argument("--no_output_moe_weight",action="store_true", + help="whether NOT to use query dependent fusion") + self.parser.add_argument("--model_config", type=str, help="model config") + + + ## Train config + self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for moment cross-entropy loss") + self.parser.add_argument("--lw_video_ce", type=float, default=0.05, help="weight for video cross-entropy loss") + self.parser.add_argument("--lr_mul", type=float, default=1, help="Learning rate multiplier for backbone module") + self.parser.add_argument("--use_extend_pool", type=int, default=1000, + help="use_extend_pool") + self.parser.add_argument("--neg_video_num",type=int,default=3, + help="sample the number of negative video, " + "if neg_video_num=0, then disable shared normalization training objective") + self.parser.add_argument("--encoder_pretrain_ckpt_filepath", type=str, + default="None", + help="first_stage_pretrain checkpoint") + self.parser.add_argument("--use_interal_vr_scores", action="store_true", + help="whether to interal_vr_scores, true only for general similarity measure function") + + ## Eval config + self.parser.add_argument("--similarity_measure", + type=str, choices=["general", "exclusive","disjoint"], + default="general",help="similarity_measure_function") + # post processing + self.parser.add_argument("--min_pred_l", type=int, default=0, + help="constrain the [st, ed] with ed - st >= 1" + 
"(1 clips with length 1.5 each, 1.5 secs in total" + "this is the min length for proposal-based method)") + self.parser.add_argument("--max_pred_l", type=int, default=24, + help="constrain the [st, ed] pairs with ed - st <= 24, 36 secs in total" + "(24 clips with length 1.5 each, " + "this is the max length for proposal-based method)") + self.parser.add_argument("--max_before_nms", type=int, default=200) + self.parser.add_argument("--max_vcmr_video", type=int, default=10, + help="ranking in top-max_vcmr_video") + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 0.7 for tvr") + self.parser.add_argument("--eval_num_per_epoch", type=float) + + # can use config files + self.parser.add_argument('--config', help='JSON config files') + self.parser.add_argument('--model_name', type=str) + + + def display_save(self, opt): + args = vars(opt) + # Display settings + # print("------------ Options -------------\n{}\n-------------------" + # .format({str(k): str(v) for k, v in sorted(args.items())})) + print("------------ Options -------------\n{}\n-------------------" + .format(pprint.pformat({str(k): str(v) for k, v in sorted(args.items())}, indent=4))) + + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + + def parse(self): + if not self.initialized: + self.initialize() + opt = parse_with_config(self.parser) + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + #opt.disable_eval = True + + if isinstance(self, TestOptions): + + # modify model_dir to absolute path + opt.model_dir = os.path.join("results", opt.model_dir) + + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. 
+ if arg not in ["results_root", "nms_thd", "debug", "dataset_config", "model_config","device", + "eval_split_name", "bsz", "eval_context_bsz", "device_ids", + "max_vcmr_video","max_pred_l", "min_pred_l", "external_inference_vr_res_path"]: + setattr(opt, arg, saved_options[arg]) + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["condor","data","results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"],) + + self.display_save(opt) + + + # assert opt.stop_task in opt.eval_tasks_at_training + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + + self.opt = opt + return opt + + +class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", + choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"], + help="Which tasks to run." + "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval. 
(will be performed automatically with VCMR)") + +if __name__ == '__main__': + print(__file__) + print(os.path.realpath(__file__)) + code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + print(code_dir) \ No newline at end of file diff --git a/config/model_config.json b/config/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..02d3eeb3cfdbbee1fb0562df1ef72609a2703b3d --- /dev/null +++ b/config/model_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1458b56e285bd34b5db29a8e6babc61f9bf02d377a7ce594579baa833190f582 +size 1637 diff --git a/config/tvr_ranking_data_config_top01.json b/config/tvr_ranking_data_config_top01.json new file mode 100644 index 0000000000000000000000000000000000000000..3caca09cd16eb6ac4e7e85d97923e12add2d00d5 --- /dev/null +++ b/config/tvr_ranking_data_config_top01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ed22c7ab836800651a9ab882496e71d93266bb6dff35c13d308243d1a5c98e +size 926 diff --git a/config/tvr_ranking_data_config_top20.json b/config/tvr_ranking_data_config_top20.json new file mode 100644 index 0000000000000000000000000000000000000000..eb6959266f20ae3b1517c96992f34609b2737761 --- /dev/null +++ b/config/tvr_ranking_data_config_top20.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509c13907d08921dd59c41b040166b4e0fd6e49260fa79adca9d23f46a804f70 +size 926 diff --git a/config/tvr_ranking_data_config_top40.json b/config/tvr_ranking_data_config_top40.json new file mode 100644 index 0000000000000000000000000000000000000000..f8405cdf6926a3bfae29f14e491eac787b01837c --- /dev/null +++ b/config/tvr_ranking_data_config_top40.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a6540a46a85534dcf79b5049cc47053cd48232f6983268a584565b4a55d48b +size 926 diff --git a/data_loader/second_stage_start_end_dataset.py b/data_loader/second_stage_start_end_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..39ebae56465c5eec920bd5d99f451243c1452390 --- /dev/null +++ b/data_loader/second_stage_start_end_dataset.py @@ -0,0 +1,349 @@ +import torch +from torch.utils.data import Dataset +import math +import os +import random +import numpy as np +from utils.basic_utils import load_json, l2_normalize_np_array, load_json +import h5py + + +class StartEndDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + Return: + a dict: { + "model_inputs": { + "query" + "feat": torch.tensor, (max_desc_len, D_q) + "feat_mask": torch.tensor, (max_desc_len) + "feat_pos_id": torch.tensor, (max_desc_len) + "feat_token_id": torch.tensor, (max_desc_len) + "visual" + "feat": torch.tensor, (max_ctx_len, D_video) + "feat_mask": torch.tensor, (max_ctx_len) + "feat_pos_id": torch.tensor, (max_ctx_len) + "feat_token_id": torch.tensor, (max_ctx_len) + "sub" (optional) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, config, data_path, vr_rank_path, max_ctx_len=100, max_desc_len=30, clip_length=1.5,ctx_mode="visual_sub", + is_eval = False, mode = "train", + neg_video_num=3, data_ratio=1, + use_extend_pool=500, inference_top_k=10): + + + self.dset_name = config.dset_name + self.root_path = config.root_path + + self.desc_bert_path = os.path.join(self.root_path,config.desc_bert_path) + self.vid_feat_path = os.path.join(self.root_path,config.vid_feat_path) + + self.ctx_mode = ctx_mode + self.use_sub = "sub" in self.ctx_mode + + if self.use_sub: + self.sub_bert_path = 
os.path.join(self.root_path, config.sub_bert_path) + + self.max_ctx_len = max_ctx_len + self.max_desc_len = max_desc_len + self.clip_length = clip_length + + self.neg_video_num = neg_video_num + self.is_eval = is_eval + + self.mode = mode + if mode in ["val", "test"]: + # = load_json(data_path) + self.annotations = load_json(data_path) + self.ground_truth = self.get_relevant_moment_gt() + self.annotations = self.expand_annotations( self.annotations) + if mode == "train": + self.annotations = self.expand_annotations(load_json(data_path)) + + self.first_VR_ranklist_pool_txn = h5py.File(vr_rank_path, "r") + self.query_bert_h5 = h5py.File(self.desc_bert_path, "r") + self.vid_feat_txn = h5py.File(self.vid_feat_path, "r") + if self.use_sub: + self.sub_bert_txn = h5py.File(self.sub_bert_path, "r") + + + self.inference_top_k = inference_top_k + video_data = load_json(os.path.join(self.root_path,config.video_duration_idx_path)) + + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + self.idx2video = {v[1]:k for k, v in video_data.items()} + self.use_extend_pool = use_extend_pool + + self.normalize_vfeat = True + self.normalize_tfeat = False + + self.visual_token_id = 0 + self.text_token_id = 1 + + def __len__(self): + return len(self.annotations) + + def expand_annotations(self, annotations): + new_annotations = [] + for i in annotations: + query = i["query"] + query_id = i["query_id"] + for moment in i["relevant_moment"]: + moment.update({'query': query, 'query_id': query_id}) + new_annotations.append(moment) + return new_annotations + + def get_relevant_moment_gt(self): + gt_all = {} + for data in self.annotations: + gt_all[data["query_id"]] = data["relevant_moment"] + return gt_all + + + def pad_feature(self, feature, max_ctx_len): + """ + Args: + feature: original feature without padding + max_ctx_len: the maximum length of video clips (or query token) + + Returns: + feat_pad : padded feature + feat_mask : feature mask + """ + N_clip, feat_dim = feature.shape + + feat_pad = torch.zeros((max_ctx_len, feat_dim)) + feat_mask = torch.zeros(max_ctx_len, dtype=torch.long) + feat_pad[:N_clip, :] = torch.from_numpy(feature) + feat_mask[:N_clip] = 1 + + return feat_pad , feat_mask + + def get_query_feat_by_query_id(self, query_id, token_id=1): + """ + Args: + query_id: unique query description id + token_id: specify modality embedding + Returns: + a dict for query: { + "feat": torch.tensor, (max_desc_len, D_q) + "feat_mask": torch.tensor, (max_desc_len) + "feat_pos_id": torch.tensor, (max_desc_len) + "feat_token_id": torch.tensor, (max_desc_len) + } + """ + + query_feat = self.query_bert_h5[str(query_id)][:self.max_desc_len] + + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + + feat_pad, feat_mask = \ + self.pad_feature(query_feat, self.max_desc_len) + + temp_model_inputs = dict() + temp_model_inputs["feat"] = feat_pad + temp_model_inputs["feat_mask"] = feat_mask + temp_model_inputs["feat_pos_id"] = torch.arange(self.max_desc_len, dtype=torch.long) + temp_model_inputs["feat_token_id"] = torch.full((self.max_desc_len,), token_id, dtype=torch.long) + + return temp_model_inputs + + def get_visual_feat_from_storage(self,vid_name): + """ + Args: + vid_name: unique video description id + Returns: + visual_feat: torch.tensor, (max_ctx_len, D_v) + Use ResNet + SlowFast , D_v = 2048 + 2304 = 4352 + """ + + visual_feat = self.vid_feat_txn[vid_name][:][:self.max_ctx_len] + + if 
self.normalize_vfeat: + visual_feat = l2_normalize_np_array(visual_feat) + + return visual_feat + + def get_sub_feat_from_storage(self,vid_name): + """ + Args: + vid_name: unique video description id + Returns: + visual_feat: torch.tensor, (max_ctx_len, D_s) + Use RoBERTa, D_s =768 + """ + + sub_feat = self.sub_bert_txn[vid_name][:][:self.max_ctx_len] + + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + + return sub_feat + + def __getitem__(self, index): + + raw_data = self.annotations[index] + # if "video_name" not in raw_data.keys(): + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["query"], + vid_name=raw_data["video_name"], + ts=raw_data["timestamp"], + ) + + # If mode is test_public, no ground-truth video_id is provided. So use a fixed dummy ground-truth video_id + if self.mode =="test_public": + meta["vid_name"] = "placeholder" + + + model_inputs = dict() + ## query information + model_inputs["query"] = self.get_query_feat_by_query_id(meta["query_id"], + token_id=self.text_token_id) + + query_id = meta["query_id"] + if query_id == 7806: + query_id += 1 + + _external_inference_vr_res = self.first_VR_ranklist_pool_txn[str(query_id)][:] + if not self.is_eval: + ##get the rank location of the ground-truth video for the first VR search engine + location = 100 + for idx, item in enumerate(_external_inference_vr_res): + if meta["vid_name"] == self.idx2video[item[0]]: + location = idx + break + + ##check all the location is below 100 when mode is train + # if self.mode =="train": + # assert 0<=location<100, meta["query_id"] + + ##get the ranklist without the ground-truth video + negative_video_pool_list = [self.idx2video[item[0]] for item in _external_inference_vr_res if meta["vid_name"] != self.idx2video[item[0]] ] + + ##sample neg_video_num negative videos for shared normalization + sampled_negative_video_pool = random.sample(negative_video_pool_list[:location+self.use_extend_pool], + k=self.neg_video_num) + ##the complete sampled video list , [pos, neg1, neg2, ...] 
+ total_vid_name_list = [meta["vid_name"],] + sampled_negative_video_pool + + self.shared_video_num = 1 + self.neg_video_num + + else: + ##during eval, use top-k videos recommended by the first VR search engine + inference_video_list = [ self.idx2video[item[0]] for item in _external_inference_vr_res[:self.inference_top_k]] + inference_video_scores = [ item[1] for item in _external_inference_vr_res[:self.inference_top_k]] + model_inputs["inference_vr_scores"] = torch.FloatTensor(inference_video_scores) + total_vid_name_list = [meta["vid_name"],] + inference_video_list + self.shared_video_num = 1 + self.inference_top_k + + # sampled neg_video_num negative videos or top-k videos + meta["sample_vid_name_list"] = total_vid_name_list[1:] + + """ + a dict for visual modality: { + "feat": torch.tensor, (shared_video_num, max_ctx_len, D_v) + "feat_mask": torch.tensor, (shared_video_num, max_ctx_len) + "feat_pos_id": torch.tensor, (shared_video_num, max_ctx_len) + "feat_token_id": torch.tensor, (shared_video_num, max_ctx_len) + } + """ + groundtruth_visual_feat = self.get_visual_feat_from_storage(meta["vid_name"]) + ctx_l, feat_dim = groundtruth_visual_feat.shape + + visual_feat_pad = torch.zeros((self.shared_video_num, self.max_ctx_len, feat_dim)) + visual_feat_mask = torch.zeros((self.shared_video_num, self.max_ctx_len), dtype=torch.long) + visual_feat_pos_id = \ + torch.repeat_interleave(torch.arange(self.max_ctx_len, dtype=torch.long).unsqueeze(0), + self.shared_video_num, dim=0) + visual_feat_token_id = torch.full((self.shared_video_num, self.max_ctx_len), self.visual_token_id, + dtype=torch.long) + + for index, video_name in enumerate(total_vid_name_list,start=0): + visual_feat = self.get_visual_feat_from_storage(video_name) + + feat_pad, feat_mask = \ + self.pad_feature(visual_feat, self.max_ctx_len) + + visual_feat_pad[index] = feat_pad + visual_feat_mask[index] = feat_mask + + temp_model_inputs = dict() + temp_model_inputs["feat"] = visual_feat_pad + temp_model_inputs["feat_mask"] = visual_feat_mask + temp_model_inputs["feat_pos_id"] = visual_feat_pos_id + temp_model_inputs["feat_token_id"] = visual_feat_token_id + + model_inputs["visual"] = temp_model_inputs + + """ + a dict for sub modality: { + "feat": torch.tensor, (shared_video_num, max_ctx_len, D_t) + "feat_mask": torch.tensor, (shared_video_num, max_ctx_len) + "feat_pos_id": torch.tensor, (shared_video_num, max_ctx_len) + "feat_token_id": torch.tensor, (shared_video_num, max_ctx_len) + } + """ + if self.use_sub: + groundtruth_sub_feat = self.get_sub_feat_from_storage(meta["vid_name"]) + + _ , feat_dim = groundtruth_sub_feat.shape + + sub_feat_pad = torch.zeros((self.shared_video_num, self.max_ctx_len, feat_dim)) + sub_feat_mask = torch.zeros((self.shared_video_num, self.max_ctx_len), dtype=torch.long) + sub_feat_pos_id = \ + torch.repeat_interleave(torch.arange(self.max_ctx_len, dtype=torch.long).unsqueeze(0), + self.shared_video_num, dim=0) + sub_feat_token_id = torch.full((self.shared_video_num, self.max_ctx_len), self.text_token_id, dtype=torch.long) + + for index, video_name in enumerate(total_vid_name_list, start=0): + sub_feat = self.get_sub_feat_from_storage(video_name) + + feat_pad, feat_mask = \ + self.pad_feature(sub_feat, self.max_ctx_len) + + sub_feat_pad[index] = feat_pad + sub_feat_mask[index] = feat_mask + + temp_model_inputs = dict() + temp_model_inputs["feat"] = sub_feat_pad + temp_model_inputs["feat_mask"] = sub_feat_mask + temp_model_inputs["feat_pos_id"] = sub_feat_pos_id + temp_model_inputs["feat_token_id"] = 
sub_feat_token_id + + model_inputs["sub"] = temp_model_inputs + + if not self.is_eval: + model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], + max_idx=ctx_l - 1) + + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + ed_idx >= st_idx + st_idx, ed_idx both belong to [0, max_idx-1] + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length) - 1, max_idx) # st_idx could be the same as ed_idx + assert 0 <= st_idx <= ed_idx <= max_idx, (ts, st_idx, ed_idx, max_idx) + return torch.LongTensor([st_idx, ed_idx]) + + diff --git a/inference.py b/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..739ef26704661746726b3176c99e114a421f7ac5 --- /dev/null +++ b/inference.py @@ -0,0 +1,570 @@ +import os +import pprint +from tqdm import tqdm +import numpy as np + +import torch +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from config.config import TestOptions +from model.conquer import CONQUER +from data_loader.second_stage_start_end_dataset import StartEndDataset as StartEndEvalDataset +from utils.inference_utils import \ + get_submission_top_n, post_processing_vcmr_nms +from utils.basic_utils import save_json , load_config +from utils.tensor_utils import find_max_triples_from_upper_triangle_product +from standalone_eval.eval import eval_retrieval +from utils.model_utils import move_cuda , start_end_collate +from utils.model_utils import VERY_NEGATIVE_NUMBER +import logging +from time import time +from ndcg_iou_topk import calculate_ndcg_iou + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? 
The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_svmr_res_from_st_ed_probs_disjoint(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + # disjoint : b_i + e_i + _st_ed_scores = np.expand_dims(svmr_gt_st_probs,axis=2) + np.expand_dims(svmr_gt_ed_probs,axis=1) + + _N_q = _st_ed_scores.shape[0] + + _valid_prob_mask = np.logical_not(generate_min_max_length_mask( + _st_ed_scores.shape, min_l=min_pred_l, max_l=max_pred_l).astype(bool)) + + valid_prob_mask = np.tile(_valid_prob_mask,(_N_q, 1, 1)) + + # invalid location will become VERY_NEGATIVE_NUMBER! + _st_ed_scores[valid_prob_mask] = VERY_NEGATIVE_NUMBER + + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + _st_ed_scores, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. + _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + # general/exclusive : \hat{b_i} * \hat{e_i} + st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs) # (N, L, L) + + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l) + st_ed_prob_product *= valid_prob_mask # invalid location will become zero! 
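As an illustrative aside (not part of `inference.py`), the masked outer product above can be sanity-checked with a toy NumPy snippet. Assuming `min_l=1` and `max_l=3`, the mask reproduces the 4x4 matrix shown in the `generate_min_max_length_mask` docstring, and only spans inside that band can win the argmax:

```python
import numpy as np

L, min_l, max_l = 4, 1, 3  # assumed toy values, matching the docstring example
ones = np.ones((1, L, L), dtype=np.float32)
# np.triu(..., k=min_l) keeps spans with ed - st >= min_l;
# 1 - np.triu(..., k=max_l) keeps spans with ed - st < max_l.
mask = np.triu(ones, k=min_l) * (1 - np.triu(ones, k=max_l))
print(mask[0].astype(int))
# [[0 1 1 0]
#  [0 0 1 1]
#  [0 0 0 1]
#  [0 0 0 0]]

# Toy start/end probabilities for a single query over L clips.
st_probs = np.array([[0.7, 0.1, 0.1, 0.1]], dtype=np.float32)
ed_probs = np.array([[0.1, 0.1, 0.2, 0.6]], dtype=np.float32)
scores = np.einsum("bm,bn->bmn", st_probs, ed_probs) * mask
best = np.unravel_index(scores.argmax(), scores.shape)
# best == (0, 0, 2): the highest-scoring valid span starts at clip 0 and ends at clip 2.
```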
+ + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. + _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + + +def compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=200, max_n_videos=100, tasks=("SVMR",)): + """ + Use val set to do evaluation, remember to run with torch.no_grad(). + model : CONQUER + eval_dataset : + opt : + max_before_nms : max moment number before non-maximum suppression + tasks: evaluation tasks + + general/exclusive function : r * \hat{b_i} + \hat{e_i} + """ + is_vr = "VR" in tasks + is_vcmr = "VCMR" in tasks + is_svmr = "SVMR" in tasks + + video2idx = eval_dataset.video2idx + + model.eval() + query_eval_loader = DataLoader(eval_dataset, + collate_fn= start_end_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=True) + + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + + if is_vcmr: + flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int) + flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32) + + if is_vr : + if opt.use_interal_vr_scores: + sorted_q2c_indices = np.tile(np.arange(max_n_videos, dtype=int),n_total_query).reshape(n_total_query,max_n_videos) + sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32) + else: + sorted_q2c_indices = np.empty((n_total_query, max_n_videos), dtype=int) + sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32) + + if is_svmr: + svmr_gt_st_probs = np.zeros((n_total_query, opt.max_ctx_len), dtype=np.float32) + svmr_gt_ed_probs = np.zeros((n_total_query, opt.max_ctx_len), dtype=np.float32) + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + + _query_metas = batch["meta"] + query_metas.extend(batch["meta"]) + + if opt.device.type == "cuda": + model_inputs = move_cuda(batch["model_inputs"], opt.device) + else: + model_inputs = batch["model_inputs"] + + + video_similarity_score, begin_score_distribution, end_score_distribution = \ + model.get_pred_from_raw_query(model_inputs) + + if is_svmr: + _svmr_st_probs = begin_score_distribution[:, 0] + _svmr_ed_probs = end_score_distribution[:, 0] + + # normalize to get true probabilities!!! 
+ # the probabilities here are already (pad) masked, so only need to do softmax + _svmr_st_probs = F.softmax(_svmr_st_probs, dim=-1) # (_N_q, L) + _svmr_ed_probs = F.softmax(_svmr_ed_probs, dim=-1) + if opt.debug: + print("svmr_st_probs: ", _svmr_st_probs) + + svmr_gt_st_probs[idx * bsz:(idx + 1) * bsz] = \ + _svmr_st_probs.cpu().numpy() + + svmr_gt_ed_probs[idx * bsz:(idx + 1) * bsz] = \ + _svmr_ed_probs.cpu().numpy() + + _vcmr_st_prob = begin_score_distribution[:, 1:] + _vcmr_ed_prob = end_score_distribution[:, 1:] + + if not (is_vr or is_vcmr): + continue + + if opt.use_interal_vr_scores: + bs = begin_score_distribution.size()[0] + _sorted_q2c_indices = torch.arange(max_n_videos).to(begin_score_distribution.device).repeat(bs,1) + _sorted_q2c_scores = model_inputs["inference_vr_scores"] + if is_vr: + sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = model_inputs["inference_vr_scores"].cpu().numpy() + else: + video_similarity_score = video_similarity_score[:, 1:] + _query_context_scores = torch.softmax(video_similarity_score,dim=1) + + # Get top-max_n_videos videos for each query + _sorted_q2c_scores, _sorted_q2c_indices = \ + torch.topk(_query_context_scores, max_n_videos, dim=1, largest=True) + if is_vr: + sorted_q2c_indices[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_indices.cpu().numpy() + sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_scores.cpu().numpy() + + + if not is_vcmr: + continue + + + # normalize to get true probabilities!!! + # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_vcmr_st_prob, dim=-1) # (_N_q, N_videos, L) + _ed_probs = F.softmax(_vcmr_ed_prob, dim=-1) + + + # Get VCMR results + # compute combined scores + row_indices = torch.arange(0, len(_st_probs), device=opt.device).unsqueeze(1) + _st_probs = _st_probs[row_indices, _sorted_q2c_indices] # (_N_q, max_n_videos, L) + _ed_probs = _ed_probs[row_indices, _sorted_q2c_indices] + + # (_N_q, max_n_videos, L, L) + # general/exclusive : r * \hat{b_i} * \hat{e_i} + _st_ed_scores = torch.einsum("qvm,qv,qvn->qvmn", _st_probs, _sorted_q2c_scores, _ed_probs) + + valid_prob_mask = generate_min_max_length_mask( + _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l) + + _st_ed_scores *= torch.from_numpy( + valid_prob_mask).to(_st_ed_scores.device) # invalid location will become zero! + + _n_q = _st_ed_scores.shape[0] + + # sort across the total_n_videos videos (by flatten from the 2nd dim) + # the indices here are local indices, not global indices + + _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1) # (N_q, total_n_videos*L*L) + _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \ + torch.sort(_flat_st_ed_scores, dim=1, descending=True) + + # collect data + flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_sorted_scores[:, :max_before_nms].detach().cpu().numpy() + flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_scores_sorted_indices[:, :max_before_nms].detach().cpu().numpy() + + if opt.debug: + break + + # Numpy starts here!!! 
+ vr_res = [] + if is_vr: + for i, (_sorted_q2c_scores_row, _sorted_q2c_indices_row) in tqdm( + enumerate(zip(sorted_q2c_scores, sorted_q2c_indices)), + desc="[VR] Loop over queries to generate predictions", total=n_total_query): + cur_vr_redictions = [] + query_specific_video_metas = query_metas[i]["sample_vid_name_list"] + for j, (v_score, v_meta_idx) in enumerate(zip(_sorted_q2c_scores_row, _sorted_q2c_indices_row)): + video_idx = video2idx[query_specific_video_metas[v_meta_idx]] + cur_vr_redictions.append([video_idx, 0, 0, float(v_score)]) + cur_query_pred = dict( + query_id=query_metas[i]["query_id"], + desc=query_metas[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + + svmr_res = [] + if is_svmr: + svmr_res = get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, + query_metas, video2idx, + clip_length=opt.clip_length, + min_pred_l=opt.min_pred_l, + max_pred_l=opt.max_pred_l, + max_before_nms=max_before_nms) + + + vcmr_res = [] + if is_vcmr: + for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm( + enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)), + desc="[VCMR] Loop over queries to generate predictions", total=n_total_query): # i is query_idx + # list([video_idx(int), st(float), ed(float), score(float)]) + video_meta_indices_local, pred_st_indices, pred_ed_indices = \ + np.unravel_index(_flat_st_ed_scores_sorted_indices, + shape=(max_n_videos, opt.max_ctx_len, opt.max_ctx_len)) + # video_meta_indices refers to the indices among the total_n_videos + # video_meta_indices_local refers to the indices among the top-max_n_videos + # video_meta_indices refers to the indices in all the videos, which is the True indices + video_meta_indices = sorted_q2c_indices[i, video_meta_indices_local] + + pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length + pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length + cur_vcmr_redictions = [] + query_specific_video_metas = query_metas[i]["sample_vid_name_list"] + for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices, _flat_st_ed_sorted_scores)): # videos + video_idx = video2idx[query_specific_video_metas[v_meta_idx]] + cur_vcmr_redictions.append( + [video_idx, float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j]), float(v_score)]) + + cur_query_pred = dict( + query_id=query_metas[i]["query_id"], + desc=query_metas[i]["desc"], + predictions=cur_vcmr_redictions) + vcmr_res.append(cur_query_pred) + + res = dict(VCMR=vcmr_res, SVMR=svmr_res, VR=vr_res) + return {k: v for k, v in res.items() if len(v) != 0} + + +def compute_query2ctx_info_disjoint(model, eval_dataset, opt, + max_before_nms=200, max_n_videos=100, maxtopk = 40): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ model : CONQUER + eval_dataset : + opt : + max_before_nms : max moment number before non-maximum suppression + tasks: evaluation tasks + + disjoint function : b_i + e_i + + """ + video2idx = eval_dataset.video2idx + + model.eval() + query_eval_loader = DataLoader(eval_dataset, collate_fn= start_end_collate, batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, shuffle=False, pin_memory=True) + + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + + flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int) + flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32) + + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + + query_metas.extend(batch["meta"]) + if opt.device.type == "cuda": + model_inputs = move_cuda(batch["model_inputs"], opt.device) + + else: + model_inputs = batch["model_inputs"] + + _ , begin_score_distribution, end_score_distribution = model.get_pred_from_raw_query(model_inputs) + + begin_score_distribution = begin_score_distribution[:,1:] + end_score_distribution= end_score_distribution[:,1:] + + # Get VCMR results + # (_N_q, total_n_videos, L, L) + # b_i + e_i + _st_ed_scores = torch.unsqueeze(begin_score_distribution, 3) + torch.unsqueeze(end_score_distribution, 2) + + _n_q, total_n_videos = _st_ed_scores.size()[:2] + + + ## mask the invalid location out of moment length constrain + _valid_prob_mask = np.logical_not(generate_min_max_length_mask( + _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l).astype(bool)) + + _valid_prob_mask = torch.from_numpy(_valid_prob_mask).to(_st_ed_scores.device) + + valid_prob_mask = _valid_prob_mask.repeat(_n_q,total_n_videos,1,1) + + # invalid location will become VERY_NEGATIVE_NUMBER! 
+ _st_ed_scores[valid_prob_mask] = VERY_NEGATIVE_NUMBER + + # sort across the total_n_videos videos (by flatten from the 2nd dim) + # the indices here are local indices, not global indices + _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1) # (N_q, total_n_videos*L*L) + _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \ + torch.sort(_flat_st_ed_scores, dim=1, descending=True) + + # collect data + flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_sorted_scores[:, :max_before_nms].detach().cpu().numpy() + flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_scores_sorted_indices[:, :max_before_nms].detach().cpu().numpy() + + + + vcmr_res = {} + for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm( + enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)), + desc="[VCMR] Loop over queries to generate predictions", total=n_total_query): # i is query_idx + # list([video_idx(int), st(float), ed(float), score(float)]) + video_meta_indices_local, pred_st_indices, pred_ed_indices = \ + np.unravel_index(_flat_st_ed_scores_sorted_indices, + shape=(total_n_videos, opt.max_ctx_len, opt.max_ctx_len)) + + pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length + pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length + cur_vcmr_redictions = [] + query_specific_video_metas = query_metas[i]["sample_vid_name_list"] + for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices_local, _flat_st_ed_sorted_scores)): # videos + # video_idx = video2idx[query_specific_video_metas[v_meta_idx]] + cur_vcmr_redictions.append( + { + "video_name": query_specific_video_metas[v_meta_idx], + "timestamp": [float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j])], + "model_scores": float(v_score) + } + ) + query_id=query_metas[i]["query_id"] + vcmr_res[query_id] = cur_vcmr_redictions[:maxtopk] + return vcmr_res + +def get_eval_res(model, eval_dataset, opt): + """compute and save query and video proposal embeddings""" + + if opt.similarity_measure == "disjoint": #disjoint b_i+ e_i + eval_res = compute_query2ctx_info_disjoint(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=opt.max_vcmr_video) + elif opt.similarity_measure in ["general" , "exclusive" ] : # r * \hat{b_i} * \hat{e_i} + eval_res = compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=opt.max_vcmr_video, + tasks=tasks) + + + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_vcmr_nms, + "VCMR": post_processing_vcmr_nms +} + +def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + +def eval_epoch(model, eval_dataset, opt, max_after_nms, iou_thds, topks): + + pred_data = get_eval_res(model, eval_dataset, opt) + # video2idx = eval_dataset.video2idx + # pred_data = get_prediction_top_n(eval_res, top_n=max_after_nms) + # pred_data = get_prediction_top_n(eval_res, top_n=max_after_nms) + gt_data = eval_dataset.ground_truth + average_ndcg = calculate_ndcg_iou(gt_data, pred_data, iou_thds, topks) + return average_ndcg, pred_data + + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + loaded_model_cfg = checkpoint["model_cfg"] + + model = CONQUER(loaded_model_cfg, + 
visual_dim=opt.visual_dim, + text_dim=opt.text_dim, + query_dim=opt.query_dim, + hidden_dim=opt.hidden_dim, + video_len=opt.max_ctx_len, + ctx_mode=opt.ctx_mode, + no_output_moe_weight=opt.no_output_moe_weight, + similarity_measure=opt.similarity_measure, + use_debug = opt.debug) + model.load_state_dict(checkpoint["model"]) + + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + assert len(opt.device_ids) == 1 + # if len(opt.device_ids) > 1: + # logger.info("Use multi GPU", opt.device_ids) + # model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + data_config = load_config(opt.dataset_config) + + eval_dataset = StartEndEvalDataset( + config = data_config, + max_ctx_len=opt.max_ctx_len, + max_desc_len= opt.max_desc_len, + clip_length = opt.clip_length, + ctx_mode = opt.ctx_mode, + mode = opt.eval_split_name, + data_ratio = opt.data_ratio, + is_eval = True, + inference_top_k = opt.max_vcmr_video) + + postfix = "_hero" + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks),postfix) + print(save_submission_filename) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/backbone/__init__.py b/model/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/backbone/encoder.py b/model/backbone/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f7a946a1184de753ea03309e9afff571a8b582a7 --- /dev/null +++ b/model/backbone/encoder.py @@ -0,0 +1,235 @@ +""" +Pytorch modules +some classes are modified from HuggingFace +(https://github.com/huggingface/transformers) +""" + +import torch +import logging +from torch import nn +logger = logging.getLogger(__name__) + +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + +from model.transformer.bert import BertEncoder +from model.layers import (NetVLAD, LinearLayer) +from model.transformer.bert_embed import (BertEmbeddings) +from utils.model_utils import mask_logits +import torch.nn.functional as F + + + +class TransformerBaseModel(nn.Module): + """ + Base Transformer model + """ + def __init__(self, config): + super(TransformerBaseModel, self).__init__() + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + + def forward(self,features,position_ids,token_type_ids,attention_mask): + # embedding layer + embedding_output = self.embeddings(token_type_ids=token_type_ids, + 
inputs_embeds=features, + position_ids=position_ids) + + encoder_outputs = self.encoder(embedding_output, attention_mask) + + sequence_output = encoder_outputs[0] + + return sequence_output + +class TwoModalEncoder(nn.Module): + """ + Two modality Transformer Encoder model + """ + + def __init__(self, config,img_dim,text_dim,hidden_dim,split_num,output_split=True): + super(TwoModalEncoder, self).__init__() + self.img_linear = LinearLayer( + in_hsz=img_dim, out_hsz=hidden_dim) + self.text_linear = LinearLayer( + in_hsz=text_dim, out_hsz=hidden_dim) + + self.transformer = TransformerBaseModel(config) + self.output_split = output_split + if self.output_split: + self.split_num = split_num + + + def forward(self, visual_features, visual_position_ids, visual_token_type_ids, visual_attention_mask, + text_features,text_position_ids,text_token_type_ids,text_attention_mask): + + transformed_im = self.img_linear(visual_features) + transformed_text = self.text_linear(text_features) + + transformer_input_feat = torch.cat((transformed_im,transformed_text),dim=1) + transformer_input_feat_pos_id = torch.cat((visual_position_ids,text_position_ids),dim=1) + transformer_input_feat_token_id = torch.cat((visual_token_type_ids,text_token_type_ids),dim=1) + transformer_input_feat_mask = torch.cat((visual_attention_mask,text_attention_mask),dim=1) + + output = self.transformer(features=transformer_input_feat, + position_ids=transformer_input_feat_pos_id, + token_type_ids=transformer_input_feat_token_id, + attention_mask=transformer_input_feat_mask) + + if self.output_split: + return torch.split(output,self.split_num,dim=1) + else: + return output + + +class OneModalEncoder(nn.Module): + """ + One modality Transformer Encoder model + """ + + def __init__(self, config,input_dim,hidden_dim): + super(OneModalEncoder, self).__init__() + self.linear = LinearLayer( + in_hsz=input_dim, out_hsz=hidden_dim) + self.transformer = TransformerBaseModel(config) + + def forward(self, features, position_ids, token_type_ids, attention_mask): + + transformed_features = self.linear(features) + + output = self.transformer(features=transformed_features, + position_ids=position_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask) + return output + + +class VideoQueryEncoder(nn.Module): + def __init__(self, config, video_modality, + visual_dim=4352, text_dim= 768, + query_dim=768, hidden_dim = 768,split_num=100,): + super(VideoQueryEncoder, self).__init__() + self.use_sub = len(video_modality) > 1 + if self.use_sub: + self.videoEncoder = TwoModalEncoder(config=config.bert_config, + img_dim = visual_dim, + text_dim = text_dim , + hidden_dim = hidden_dim, + split_num = split_num + ) + else: + self.videoEncoder = OneModalEncoder(config=config.bert_config, + input_dim = visual_dim, + hidden_dim = hidden_dim, + ) + + self.queryEncoder = OneModalEncoder(config=config.query_bert_config, + input_dim= query_dim, + hidden_dim=hidden_dim, + ) + + def forward_repr_query(self, batch): + + query_output = self.queryEncoder( + features=batch["query"]["feat"], + position_ids=batch["query"]["feat_pos_id"], + token_type_ids=batch["query"]["feat_token_id"], + attention_mask=batch["query"]["feat_mask"] + ) + + return query_output + + def forward_repr_video(self,batch): + video_output = dict() + + if len(batch["visual"]["feat"].size()) == 4: + bsz, num_video = batch["visual"]["feat"].size()[:2] + for key in batch.keys(): + if key in ["visual", "sub"]: + for key_2 in batch[key]: + if key_2 in ["feat", "feat_mask", "feat_pos_id", 
"feat_token_id"]: + shape_list = batch[key][key_2].size()[2:] + batch[key][key_2] = batch[key][key_2].view((bsz * num_video,) + shape_list) + + + if self.use_sub: + video_output["visual"], video_output["sub"] = self.videoEncoder( + visual_features=batch["visual"]["feat"], + visual_position_ids=batch["visual"]["feat_pos_id"], + visual_token_type_ids=batch["visual"]["feat_token_id"], + visual_attention_mask=batch["visual"]["feat_mask"], + text_features=batch["sub"]["feat"], + text_position_ids=batch["sub"]["feat_pos_id"], + text_token_type_ids=batch["sub"]["feat_token_id"], + text_attention_mask=batch["sub"]["feat_mask"] + ) + else: + video_output["visual"] = self.videoEncoder( + features=batch["visual"]["feat"], + position_ids=batch["visual"]["feat_pos_id"], + token_type_ids=batch["visual"]["feat_token_id"], + attention_mask=batch["visual"]["feat_mask"] + ) + + return video_output + + + def forward_repr_both(self, batch): + video_output = self.forward_repr_video(batch) + query_output = self.forward_repr_query(batch) + + return {"video_feat": video_output, + "query_feat": query_output} + + def forward(self,batch,task="repr_both"): + + if task == "repr_both": + return self.forward_repr_both(batch) + elif task == "repr_video": + return self.forward_repr_video(batch) + elif task == "repr_query": + return self.forward_repr_query(batch) + + +class QueryWeightEncoder(nn.Module): + """ + Query Weight Encoder + Using NetVLAD to aggreate contextual query features + Using FC + Softmax to get fusion weights for each modality + """ + def __init__(self, config, video_modality): + super(QueryWeightEncoder, self).__init__() + + ##NetVLAD + self.text_pooling = NetVLAD(feature_size=config.hidden_size,cluster_size=config.text_cluster) + self.moe_txt_dropout = nn.Dropout(config.moe_dropout_prob) + + ##FC + self.moe_fc_txt = nn.Linear( + in_features=self.text_pooling.out_dim, + out_features=len(video_modality), + bias=False) + + self.video_modality = video_modality + + def forward(self, query_feat): + ##NetVLAD + pooled_text = self.text_pooling(query_feat) + pooled_text = self.moe_txt_dropout(pooled_text) + + ##FC + Softmax + moe_weights = self.moe_fc_txt(pooled_text) + softmax_moe_weights = F.softmax(moe_weights, dim=1) + + + moe_weights_dict = dict() + for modality, moe_weight in zip(self.video_modality, torch.split(softmax_moe_weights, 1, dim=1)): + moe_weights_dict[modality] = moe_weight.squeeze(1) + + return moe_weights_dict + + + + diff --git a/model/conquer.py b/model/conquer.py new file mode 100644 index 0000000000000000000000000000000000000000..0139ff8bbac2c7259f056d574f1b0d3fef03eb6e --- /dev/null +++ b/model/conquer.py @@ -0,0 +1,205 @@ +import torch +import torch.nn as nn +from model.backbone.encoder import VideoQueryEncoder, QueryWeightEncoder +from model.qal.query_aware_learning_module import BiDirectionalAttention +from model.layers import FCPlusTransformer#,MomentLocalizationHead +from model.head.ml_head import MomentLocalizationHead +from model.head.vs_head import VideoScoringHead + +import logging +logger = logging.getLogger(__name__) + + +class CONQUER(nn.Module): + def __init__(self, config, + visual_dim = 4352, + text_dim = 768, + query_dim = 768, + hidden_dim = 768, + video_len = 100, + ctx_mode = "visual_sub", + lw_st_ed = 0.01, + lw_video_ce = 0.05, + similarity_measure="general", + use_debug=False, + no_output_moe_weight=False): + + super(CONQUER, self).__init__() + self.config = config + + # related configs + self.lw_st_ed = lw_st_ed + self.lw_video_ce = lw_video_ce + 
self.similarity_measure = similarity_measure + + self.video_modality = ctx_mode.split("_") + logger.info("video modality : %s" % self.video_modality) + self.output_moe_weight = not no_output_moe_weight + + hidden_dim = hidden_dim + base_bert_layer_config = config.bert_config + + ## Backbone encoder + self.encoder = VideoQueryEncoder(config,video_modality=self.video_modality, + visual_dim=visual_dim,text_dim=text_dim,query_dim=query_dim, + hidden_dim=hidden_dim,split_num=video_len) + + if self.output_moe_weight and len(self.video_modality) > 1: + self.query_weight = QueryWeightEncoder(config.netvlad_config,video_modality=self.video_modality) + + ## Query_aware_feature_learning Module + self.query_aware_feature_learning_layer = BiDirectionalAttention(hidden_dim) + + ## Shared transformer for both moment localization and video scoring heads + self.contextual_QAL_feature_learning = FCPlusTransformer(base_bert_layer_config,hidden_dim * 4) + + ## Moment_localization_head + self.moment_localization_head = MomentLocalizationHead(config.moment_localization_config,base_bert_layer_config,hidden_dim) + self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") + + ## Optional video_scoring_head + if self.similarity_measure == "exclusive": + self.video_scoring_head = VideoScoringHead(config.video_scoring_config,base_bert_layer_config,hidden_dim) + self.score_ce = nn.CrossEntropyLoss(reduction="mean") + + self.debug_model = use_debug + if self.debug_model: + logger.setLevel(level=logging.DEBUG) + + self.reset_parameters() + + def reset_parameters(self): + """ Initialize the weights.""" + + def re_init(module): + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + #print("nn.Linear, nn.Embedding: ", module) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + module.reset_parameters() + + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + self.apply(re_init) + + + def compute_final_score(self,score_dict,moe_weights=None): + + sample_key = list(score_dict.keys())[0] + final_query_context_scores = torch.zeros_like(score_dict[sample_key]) + shape_size = len(score_dict[sample_key].shape) + if moe_weights is not None: + for mod in self.video_modality: + if shape_size == 2: + final_query_context_scores += torch.einsum("nm,n->nm", score_dict[mod], moe_weights[mod]) + elif shape_size == 3: + final_query_context_scores += torch.einsum("nlm,n->nlm", score_dict[mod], moe_weights[mod]) + else: + for mod in self.video_modality: + final_query_context_scores += torch.div(score_dict[mod], len(self.video_modality)) + + return final_query_context_scores + + + def get_pred_from_raw_query(self, batch): + + ## Extract query and video feature through MMT backbone + _query_feature = self.encoder(batch, task="repr_query") #Widehat_Q + + _video_feature_dict = self.encoder(batch, task="repr_video") #Widehat_V and #Widehat_S + + ## Shared normalization technique + ## Use the same query feature for shared_video_num times + sample_key = list(_video_feature_dict.keys())[0] + query_batch = _query_feature.size()[0] + video_batch, video_len = _video_feature_dict[sample_key].size()[:2] + shared_video_num = int(video_batch / query_batch) + + query_feature = 
torch.repeat_interleave(_query_feature, shared_video_num, dim=0) + query_mask = torch.repeat_interleave(batch["query"]["feat_mask"], shared_video_num, dim=0) + + + ## Compute Query Dependent Fusion video feature + if self.output_moe_weight and len(self.video_modality) > 1: + moe_weights_dict = self.query_weight(query_feature) + QDF_feature = self.compute_final_score(_video_feature_dict, moe_weights_dict) + else: + QDF_feature = self.compute_final_score(_video_feature_dict,None) + + video_mask = batch["visual"]["feat_mask"] + + + ## Compute Query Aware Learning video feature + QAL_feature = self.query_aware_feature_learning_layer(QDF_feature, query_feature, + video_mask,query_mask) + + ## Contextualize QAL features + Contextual_QAL = self.contextual_QAL_feature_learning( + features=QAL_feature, + feat_mask=video_mask) + + G = torch.cat([QAL_feature,Contextual_QAL], dim=2) + + ## Moment localization head + begin_score_distribution , end_score_distribution = self.moment_localization_head(G,Contextual_QAL,video_mask) + begin_score_distribution = begin_score_distribution.view(query_batch, shared_video_num, video_len) + end_score_distribution = end_score_distribution.view(query_batch, shared_video_num, video_len) + + ## Optional video scoring head + video_similarity_score = None + if self.similarity_measure == "exclusive": + video_similarity_score = self.video_scoring_head(G,video_mask) + video_similarity_score = video_similarity_score.view(query_batch, shared_video_num) + + return video_similarity_score, begin_score_distribution , end_score_distribution + + + def get_moment_loss_share_norm(self, begin_score_distribution, end_score_distribution ,st_ed_indices): + + bs , shared_video_num , video_len = begin_score_distribution.size() + + begin_score_distribution = begin_score_distribution.view(bs,-1) + end_score_distribution = end_score_distribution.view(bs,-1) + + loss_st = self.temporal_criterion(begin_score_distribution, st_ed_indices[:, 0]) + loss_ed = self.temporal_criterion(end_score_distribution, st_ed_indices[:, 1]) + moment_ce_loss = loss_st + loss_ed + + return moment_ce_loss + + + def forward(self,batch): + + video_similarity_score, begin_score_distribution , end_score_distribution = \ + self.get_pred_from_raw_query(batch) + + moment_ce_loss, video_ce_loss = 0, 0 + + # moment cross-entropy loss + # if neg_video_num = 0, we do not sample negative videos + # the softmax operator is performed only for the ground-truth video + # which mean to not use shared normalization training objective + moment_ce_loss = self.get_moment_loss_share_norm( + begin_score_distribution, end_score_distribution, batch["st_ed_indices"]) + moment_ce_loss = self.lw_st_ed * moment_ce_loss + + if self.similarity_measure == "exclusive": + ce_label = batch["st_ed_indices"].new_zeros(video_similarity_score.size()[0]) + video_ce_loss = self.score_ce(video_similarity_score, ce_label) + video_ce_loss = self.lw_video_ce*video_ce_loss + + + loss = moment_ce_loss + video_ce_loss + return loss, {"moment_ce_loss": float(moment_ce_loss), + "video_ce_loss": float(video_ce_loss), + "loss_overall": float(loss)} + + + + diff --git a/model/head/__init__.py b/model/head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/head/ml_head.py b/model/head/ml_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8be34a4ef332e698bc4655999f1dda4f346135f4 --- /dev/null +++ b/model/head/ml_head.py @@ -0,0 +1,61 @@ +import torch +from 
torch import nn +import logging +logger = logging.getLogger(__name__) + + +from model.layers import FCPlusTransformer, ConvSE + + +class MomentLocalizationHead(nn.Module): + """ + Moment localization head model + """ + + def __init__(self, config,base_bert_layer_config,hidden_dim): + super(MomentLocalizationHead, self).__init__() + + base_bert_layer_config = base_bert_layer_config + hidden_dim = hidden_dim + + self.begin_feature_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 5) + + self.end_feature_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 2) + + self.begin_score_modeling = ConvSE(config) + self.end_score_modeling = ConvSE(config) + + def forward(self, G, Contextual_QAL, video_mask): + """ + Inputs: + :param contextual_qal_features: (batch, feat_size, L_v) + :param video_mask: (batch, L_v) + Return: + score: (begin or end) score distribution + """ + ## OUTPUT LAYER + begin_features = self.begin_feature_modeling( + features=G, + feat_mask=video_mask) + + end_features = self.end_feature_modeling( + features=torch.cat([Contextual_QAL, begin_features], dim=2), + feat_mask=video_mask) + + ## Un-normalized + begin_input_feature = torch.transpose(begin_features, 1, 2) + end_input_feature = torch.transpose(end_features, 1, 2) + + begin_score_distribution = self.begin_score_modeling( + contextual_qal_features=begin_input_feature, + video_mask=video_mask, + ) + + end_score_distribution = self.end_score_modeling( + contextual_qal_features=end_input_feature, + video_mask=video_mask, + ) + + return begin_score_distribution , end_score_distribution + + diff --git a/model/head/vs_head.py b/model/head/vs_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a94c3a4bb5c9f85dd0250a82a7fcb8124775de3d --- /dev/null +++ b/model/head/vs_head.py @@ -0,0 +1,42 @@ +import torch +from torch import nn + +import logging +logger = logging.getLogger(__name__) + +from model.layers import FCPlusTransformer + +class VideoScoringHead(nn.Module): + """ + Video Scoring Head + """ + + def __init__(self, config,base_bert_layer_config,hidden_dim): + super(VideoScoringHead, self).__init__() + + base_bert_layer_config = base_bert_layer_config + hidden_dim = hidden_dim + + + self.video_feature_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 5) + + self.video_score_predictor = nn.Sequential( + nn.Linear(**config.linear_1_cfg), + nn.ReLU(), + nn.Linear(**config.linear_2_cfg) + ) + + + def forward(self, G, video_mask): + + + ## Contexual_QAL_feature for video scoring + R = self.video_feature_modeling( + features=G, + feat_mask=video_mask) + + holistic_video_feature, _ = torch.max(R, dim=1) + + video_similarity_score = self.video_score_predictor(holistic_video_feature.squeeze(1)) # r + + return video_similarity_score \ No newline at end of file diff --git a/model/layers.py b/model/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1001996c43ced2fb6f5216cef166299b197447 --- /dev/null +++ b/model/layers.py @@ -0,0 +1,196 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +import logging + +logger = logging.getLogger(__name__) +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + +from model.transformer.bert import BertEncoder +from model.modeling_utils import mask_logits + +class LinearLayer(nn.Module): + """linear layer configurable with layer normalization, dropout, ReLU.""" + 
def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True,tanh=False): + super(LinearLayer, self).__init__() + self.relu = relu + self.tanh = tanh + self.layer_norm = layer_norm + if layer_norm: + self.LayerNorm = BertLayerNorm(in_hsz) + layers = [ + nn.Dropout(dropout), + nn.Linear(in_hsz, out_hsz) + ] + self.net = nn.Sequential(*layers) + + def forward(self, x): + """(N, L, D)""" + if self.layer_norm: + x = self.LayerNorm(x) + x = self.net(x) + if self.relu: + x = F.relu(x, inplace=True) + if self.tanh: + x = torch.tanh(x) + return x # (N, L, D) + + +class NetVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_norm=True): + super(NetVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(feature_size, cluster_size)) + self.clusters2 = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(1, feature_size, cluster_size)) + + self.add_norm = add_norm + self.LayerNorm = BertLayerNorm(cluster_size) + self.out_dim = cluster_size * feature_size + + def forward(self, x): + max_sample = x.size()[1] + x = x.view(-1, self.feature_size) + assignment = torch.matmul(x, self.clusters) + + if self.add_norm: + assignment = self.LayerNorm(assignment) + + assignment = F.softmax(assignment, dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + a_sum = torch.sum(assignment, -2, keepdim=True) + a = a_sum * self.clusters2 + + assignment = assignment.transpose(1, 2) + + x = x.view(-1, max_sample, self.feature_size) + vlad = torch.matmul(assignment, x) + vlad = vlad.transpose(1, 2) + vlad = vlad - a + + # L2 intra norm + vlad = F.normalize(vlad) + + # flattening + L2 norm + vlad = vlad.reshape(-1, self.cluster_size * self.feature_size) + vlad = F.normalize(vlad) + + return vlad + + +class FCPlusTransformer(nn.Module): + """ + FC + Transformer + FC layer reduces input feature size into hidden size + Transformer contextualizes QAL feature + """ + + def __init__(self, config,input_dim): + super(FCPlusTransformer, self).__init__() + self.trans_linear = LinearLayer( + in_hsz=input_dim, out_hsz=config.hidden_size) + self.encoder = BertEncoder(config) + + def forward(self,features, feat_mask): + """ + Inputs: + :param contextual_qal_features: (batch, L_v, input_dim) + :param feat_mask: (batch, L_v) + Return: + sequence_output: (batch, L_v, hidden_size) + """ + transformed_features = self.trans_linear(features) + + encoder_outputs = self.encoder(transformed_features, feat_mask) + + sequence_output = encoder_outputs[0] + + return sequence_output + + +class ConvSE(nn.Module): + """ + ConvSE module + """ + def __init__(self, config): + super(ConvSE, self).__init__() + + self.clip_score_predictor = nn.Sequential( + nn.Conv1d(**config.conv_cfg_1), + nn.ReLU(), + nn.Conv1d(**config.conv_cfg_2), + ) + + + def forward(self, contextual_qal_features, video_mask): + """ + Inputs: + :param contextual_qal_features: (batch, feat_size, L_v) + :param video_mask: (batch, L_v) + Return: + score: (begin or end) score distribution + """ + score = self.clip_score_predictor(contextual_qal_features).squeeze(1) #(batch, L_v) + + score = mask_logits(score, video_mask) #(batch, L_v) + + return score + + +class MomentLocalizationHead(nn.Module): + """ + Moment localization head model + """ + + def __init__(self, config,base_bert_layer_config,hidden_dim): + super(MomentLocalizationHead, self).__init__() + + base_bert_layer_config = base_bert_layer_config + hidden_dim = 
hidden_dim + + self.start_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 5) + + self.end_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 2) + + self.start_reader = ConvSE(config) + self.end_reader = ConvSE(config) + + def forward(self, G, Contextual_QAL, video_mask): + """ + Inputs: + :param contextual_qal_features: (batch, feat_size, L_v) + :param video_mask: (batch, L_v) + Return: + score: (begin or end) score distribution + """ + ## OUTPUT LAYER + start_features = self.start_modeling( + features=G, + feat_mask=video_mask) + + end_features = self.end_modeling( + features=torch.cat([Contextual_QAL, start_features], dim=2), + feat_mask=video_mask) + + ## Un-normalized + start_reader_input_feature = torch.transpose(start_features, 1, 2) + end_reader_input_feature = torch.transpose(end_features, 1, 2) + + reader_st_prob = self.start_reader( + contextual_qal_features=start_reader_input_feature, + video_mask=video_mask, + ) + + reader_ed_prob = self.end_reader( + contextual_qal_features=end_reader_input_feature, + video_mask=video_mask, + ) + + return reader_st_prob,reader_ed_prob diff --git a/model/modeling_utils.py b/model/modeling_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..61e3c10d22fc50aa780fe4a9d7d81844e9401b0b --- /dev/null +++ b/model/modeling_utils.py @@ -0,0 +1,135 @@ +""" +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. + +some functions are modified from HuggingFace +(https://github.com/huggingface/transformers) +""" +import torch +from torch import nn +import logging +logger = logging.getLogger(__name__) + + +def prune_linear_layer(layer, index, dim=0): + """ Prune a linear layer (a model parameters) + to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear( + new_size[1], new_size[0], bias=layer.bias is not None).to( + layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +def mask_logits(target, mask, eps=-1e4): + return target * mask + (1 - mask) * eps + + +def load_partial_checkpoint(checkpoint, n_layers, skip_layers=True): + if skip_layers: + new_checkpoint = {} + gap = int(12/n_layers) + prefix = "roberta.encoder.layer." 
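+        # With n_layers=6 the gap is 2, so pretrained layers 1,3,5,7,9,11 are kept + # and renumbered to 0-5; with n_layers=1 only layer 11 is kept as layer 0. + # Checkpoint layers outside this mapping are dropped.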
+ layer_range = {str(l): str(i) for i, l in enumerate( + list(range(gap-1, 12, gap)))} + for k, v in checkpoint.items(): + if prefix in k: + layer_name = k.split(".") + layer_num = layer_name[3] + if layer_num in layer_range: + layer_name[3] = layer_range[layer_num] + new_layer_name = ".".join(layer_name) + new_checkpoint[new_layer_name] = v + else: + new_checkpoint[k] = v + else: + new_checkpoint = checkpoint + return new_checkpoint + + +def load_pretrained_weight(model, state_dict): + # Load from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = ({} if metadata is None + else metadata.get(prefix[:-1], {})) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, + unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + start_prefix = '' + if not hasattr(model, 'roberta') and\ + any(s.startswith('roberta.') for s in state_dict.keys()): + start_prefix = 'roberta.' + + load(model, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from " + "pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in " + "{}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for ' + '{}:\n\t{}'.format( + model.__class__.__name__, + "\n\t".join(error_msgs))) + return model + + +def pad_tensor_to_mul(tensor, dim=0, mul=8): + """ pad tensor to multiples (8 for tensor cores) """ + t_size = list(tensor.size()) + n_pad = mul - t_size[dim] % mul + if n_pad == mul: + n_pad = 0 + padded_tensor = tensor + else: + t_size[dim] = n_pad + pad = torch.zeros(*t_size, dtype=tensor.dtype, device=tensor.device) + padded_tensor = torch.cat([tensor, pad], dim=dim) + return padded_tensor, n_pad diff --git a/model/qal/__init__.py b/model/qal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/qal/query_aware_learning_module.py b/model/qal/query_aware_learning_module.py new file mode 100644 index 0000000000000000000000000000000000000000..cd64822b49e5d2c1591c0582fea8f081cea68672 --- /dev/null +++ b/model/qal/query_aware_learning_module.py @@ -0,0 +1,92 @@ +import torch +from torch import nn + +import logging +logger = logging.getLogger(__name__) + +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + +from utils.model_utils import mask_logits +import torch.nn.functional as F + + +class BiDirectionalAttention(nn.Module): + """ + Bi-directional attention flow + Perform query-to-video attention (Q2V) and video-to-query attention (V2Q) + Append QDF 
features with a set of query-aware features to form QAL feature + """ + + def __init__(self, video_dim): + super(BiDirectionalAttention, self).__init__() + ## Core Attention for query-aware feature learining + self.similarity_weight = nn.Linear(video_dim * 3, 1, bias=False) + + + def forward(self, QDF_emb, query_emb,video_mask, query_mask): + """ + Inputs: + :param QDF_emb: (batch, L_v, feat_size) + :param query_emb: (batch, L_q, feat_size) + :param video_mask: (batch, L_v) + :param query_mask: (batch, L_q) + Return: + QAL: (batch, L_v, feat_size*4) + """ + + ## CREATE SIMILARITY MATRIX + video_len = QDF_emb.size()[1] + query_len = query_emb.size()[1] + + _QDF_emb = QDF_emb.unsqueeze(2).repeat(1, 1, query_len, 1) + # [bs, video_len, 1, feat_size] => [bs, video_len, query_len, feat_size] + + _query_emb = query_emb.unsqueeze(1).repeat(1, video_len, 1, 1) + # [bs, 1, query_len, feat_size] => [bs, video_len, query_len, feat_size] + + elementwise_prod = torch.mul(_QDF_emb, _query_emb) + # [bs, video_len, query_len, feat_size] + + alpha = torch.cat([_QDF_emb, _query_emb, elementwise_prod], dim=3) + # [bs, video_len, query_len, feat_size*3] + + similarity_matrix = self.similarity_weight(alpha).view(-1, video_len, query_len) + + similarity_matrix_mask = torch.einsum("bn,bm->bnm", video_mask, query_mask) + # [bs, video_len, query_len] + + ## CALCULATE Video2Query ATTENTION + + a = F.softmax(mask_logits(similarity_matrix, + similarity_matrix_mask), dim=-1) + # [bs, video_len, query_len] + + V2Q = torch.bmm(a, query_emb) + # [bs] ([video_len, query_len] X [query_len, feat_size]) => [bs, video_len, feat_size] + + ## CALCULATE Query2Video ATTENTION + + b = F.softmax(torch.max(mask_logits(similarity_matrix, similarity_matrix_mask), 2)[0], dim=-1) + # [bs, video_len] + + b = b.unsqueeze(1) + # [bs, 1, video_len] + + Q2V = torch.bmm(b, QDF_emb) + # [bs] ([bs, 1, video_len] X [bs, video_len, feat_size]) => [bs, 1, feat_size] + + Q2V = Q2V.repeat(1, video_len, 1) + # [bs, video_len, feat_size] + + ## Append QDF_emb with three query-aware features + + QAL = torch.cat([QDF_emb, V2Q, + torch.mul(QDF_emb, V2Q), + torch.mul(QDF_emb, Q2V)], dim=2) + + # [bs, video_len, feat_size*4] + + return QAL \ No newline at end of file diff --git a/model/transformer/__init__.py b/model/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/transformer/bert.py b/model/transformer/bert.py new file mode 100644 index 0000000000000000000000000000000000000000..4d49740bc70c765020c6f7856115b7228a4dd75c --- /dev/null +++ b/model/transformer/bert.py @@ -0,0 +1,275 @@ +""" +BERT/RoBERTa layers from the huggingface implementation +(https://github.com/huggingface/transformers) +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from model.modeling_utils import prune_linear_layer +import math +import logging +logger = logging.getLogger(__name__) +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + + +def gelu(x): + """ Original Implementation of the gelu activation function + in Google Bert repo when initialy created. 
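+        It returns the exact form 0.5 * x * (1 + erf(x / sqrt(2))).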
+ For information: OpenAI GPT's gelu is slightly different + (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) + * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def gelu_new(x): + """ Implementation of the gelu activation function currently + in Google Bert repo (identical to OpenAI GPT). + Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * ( + 1 + torch.tanh( + math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = { + "gelu": gelu, + "relu": torch.nn.functional.relu, + "swish": swish, "gelu_new": gelu_new} + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of " + "the number of attention heads (%d)" % ( + config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int( + config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads *\ + self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" + # and "key" to get the raw attention scores. + attention_scores = torch.matmul( + query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is + # (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
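+        # attention_probs: (batch, num_attention_heads, seq_len, seq_len)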
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs)\ + if self.output_attentions else (context_layer,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones( + self.self.num_attention_heads, self.self.attention_head_size) + # Convert to set and emove already pruned heads + heads = set(heads) - self.pruned_heads + for head in heads: + # Compute how many pruned heads are + # before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size =\ + self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, input_tensor, attention_mask=None, head_mask=None): + self_outputs = self.self(input_tensor, attention_mask, head_mask) + attention_output = self.output(self_outputs[0], input_tensor) + # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, 
input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + attention_outputs = self.attention( + hidden_states, attention_mask, head_mask) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + # add attentions if we output them + outputs = (layer_output,) + attention_outputs[1:] + return outputs + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = nn.ModuleList([BertLayer(config) for _ in range( + config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to + # [batch_size, num_heads, from_seq_length, to_seq_length] + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, extended_attention_mask, None) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + # last-layer hidden state, (all hidden states), (all attentions) + return outputs diff --git a/model/transformer/bert_embed.py b/model/transformer/bert_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..73751ebda8f2342d3159e4fd893c34d03c900d13 --- /dev/null +++ b/model/transformer/bert_embed.py @@ -0,0 +1,64 @@ +""" +Input Embedding Layers +""" +import torch +import torch.nn as nn +import logging + + +logger = logging.getLogger(__name__) +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + logger.info( + "Better speed can be achieved with apex installed from " + "https://www.github.com/nvidia/apex ." 
+ ) + BertLayerNorm = torch.nn.LayerNorm + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + #self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + # self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + # self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + position_embeddings = self.position_embeddings(position_ids) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + diff --git a/ndcg_iou_topk.py b/ndcg_iou_topk.py new file mode 100644 index 0000000000000000000000000000000000000000..b35d646c57473d1e3b6eb5550bcb6a1845ebd678 --- /dev/null +++ b/ndcg_iou_topk.py @@ -0,0 +1,66 @@ +from utils.basic_utils import load_jsonl, save_jsonl, load_json +import pandas as pd +from tqdm import tqdm +import numpy as np +from collections import defaultdict +import copy + +def calculate_iou(pred_start: float, pred_end: float, gt_start: float, gt_end: float) -> float: + intersection_start = max(pred_start, gt_start) + intersection_end = min(pred_end, gt_end) + intersection = max(0, intersection_end - intersection_start) + union = (pred_end - pred_start) + (gt_end - gt_start) - intersection + return intersection / union if union > 0 else 0 + + +# Function to calculate DCG +def calculate_dcg(scores): + return sum((2**score - 1) / np.log2(idx + 2) for idx, score in enumerate(scores)) + +# Function to calculate NDCG +def calculate_ndcg(pred_scores, true_scores): + dcg = calculate_dcg(pred_scores) + idcg = calculate_dcg(sorted(true_scores, reverse=True)) + return dcg / idcg if idcg > 0 else 0 + + + +def calculate_ndcg_iou(all_gt, all_pred, TS, KS): + performance = defaultdict(lambda: defaultdict(list)) + performance_avg = defaultdict(lambda: defaultdict(float)) + for k in tqdm(all_pred.keys(), desc="Calculate NDCG"): + one_pred = all_pred[k] + one_gt = all_gt[k] + + one_gt.sort(key=lambda x: x["relevance"], reverse=True) + for T in TS: + one_gt_drop = copy.deepcopy(one_gt) + predictions_with_scores = [] + + for pred in one_pred: + pred_video_name, pred_time = pred["video_name"], 
pred["timestamp"] + matched_rows = [gt for gt in one_gt_drop if gt["video_name"] == pred_video_name] + if not matched_rows: + pred["pred_relevance"] = 0 + else: + ious = [calculate_iou(pred_time[0], pred_time[1], gt["timestamp"][0], gt["timestamp"][1]) for gt in matched_rows] + max_iou_idx = np.argmax(ious) + max_iou_row = matched_rows[max_iou_idx] + + if ious[max_iou_idx] > T: + pred["pred_relevance"] = max_iou_row["relevance"] + # Remove the matched ground truth row + original_idx = one_gt_drop.index(max_iou_row) + one_gt_drop.pop(original_idx) + else: + pred["pred_relevance"] = 0 + predictions_with_scores.append(pred) + for K in KS: + true_scores = [gt["relevance"] for gt in one_gt][:K] + pred_scores = [pred["pred_relevance"] for pred in predictions_with_scores][:K] + ndcg_score = calculate_ndcg(pred_scores, true_scores) + performance[K][T].append(ndcg_score) + for K, vs in performance.items(): + for T, v in vs.items(): + performance_avg[K][T] = np.mean(v) + return performance_avg diff --git a/optim/adamw.py b/optim/adamw.py new file mode 100644 index 0000000000000000000000000000000000000000..485d0570fd8be0b4d70c5d39e231e8e611e9ee54 --- /dev/null +++ b/optim/adamw.py @@ -0,0 +1,106 @@ +""" +AdamW optimizer (weight decay fix) +originally from hugginface (https://github.com/huggingface/transformers). + +Copied from UNITER +(https://github.com/ChenRocks/UNITER) +""" +import math + +import torch +from torch.optim import Optimizer + + +class AdamW(Optimizer): + """ Implements Adam algorithm with weight decay fix. + Parameters: + lr (float): learning rate. Default 1e-3. + betas (tuple of 2 floats): Adams beta parameters (b1, b2). + Default: (0.9, 0.999) + eps (float): Adams epsilon. Default: 1e-6 + weight_decay (float): Weight decay. Default: 0.0 + correct_bias (bool): can be set to False to avoid correcting bias + in Adam (e.g. like in Bert TF repository). Default True. + """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, + weight_decay=0.0, correct_bias=True): + if lr < 0.0: + raise ValueError( + "Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter: {} - " + "should be in [0.0, 1.0[".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter: {} - " + "should be in [0.0, 1.0[".format(betas[1])) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {} - " + "should be >= 0.0".format(eps)) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + correct_bias=correct_bias) + super(AdamW, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'Adam does not support sparse ' + 'gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + exp_avg.mul_(beta1).add_(grad , alpha=1.0 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) + denom = exp_avg_sq.sqrt().add_(group['eps']) + + step_size = group['lr'] + if group['correct_bias']: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state['step'] + bias_correction2 = 1.0 - beta2 ** state['step'] + step_size = (step_size * math.sqrt(bias_correction2) + / bias_correction1) + + p.data.addcdiv_(exp_avg, denom, value=-step_size) + + # Just adding the square of the weights to the loss function is + # *not* the correct way of using L2 regularization/weight decay + # with Adam, since that will interact with the m and v + # parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't + # interact with the m/v parameters. This is equivalent to + # adding the square of the weights to the loss with plain + # (non-momentum) SGD. + # Add weight decay at the end (fixed version) + if group['weight_decay'] > 0.0: + p.data.add_(p.data, alpha=-group['lr'] * group['weight_decay']) + + return loss diff --git a/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01.log b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01.log new file mode 100644 index 0000000000000000000000000000000000000000..1053dba705d5253f44765cecffddc72e9cede8b4 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4d870ccff8ab61b72571cd7c9f84eb916d84fd7f091b2e300dfb9d4be5ee518 +size 29628 diff --git a/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01_back.log b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01_back.log new file mode 100644 index 0000000000000000000000000000000000000000..86854b9f67defe55568d8cf31987a26891f8c952 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01_back.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef85a542568c80fab7d57d69041ebd898e30d4fc912082bd4d571aea3ec6424c +size 29917 diff --git a/results/tvr-top01-2024_07_08_17_18_30/best_test_predictions.json b/results/tvr-top01-2024_07_08_17_18_30/best_test_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..2b6aacaf7ac7175a22af8df0248032f6c0ed6577 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/best_test_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0becb2747c635a0080149ccb3e92975f7bf4bf3a99d025fd41d29ae9287db438 +size 14263264 diff --git a/results/tvr-top01-2024_07_08_17_18_30/best_val_predictions.json 
b/results/tvr-top01-2024_07_08_17_18_30/best_val_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..dc97f037c6ff1aea91b6954fed138ecd48459920 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/best_val_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ced0079b54bdbc05268645d80c6fa52b1ed44c6e04f6922d535be29aa3fd8c +size 2560976 diff --git a/results/tvr-top01-2024_07_08_17_18_30/code.zip b/results/tvr-top01-2024_07_08_17_18_30/code.zip new file mode 100644 index 0000000000000000000000000000000000000000..58dd71a9e4af908ac0eda92c20785c87a430e792 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/code.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b0711364459d5340f2e887420295145188a9008d5b50b5ddde46b221645c23 +size 1141392 diff --git a/results/tvr-top01-2024_07_08_17_18_30/model.ckpt b/results/tvr-top01-2024_07_08_17_18_30/model.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..2eb63a9bff37b0aff8d11b4e3a2d3d40b19c17e5 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/model.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa2b8044636fe7ce9ab4d36df179ec2358f10a579de4ee5a7e58f338553558d2 +size 190742082 diff --git a/results/tvr-top01-2024_07_08_17_18_30/opt.json b/results/tvr-top01-2024_07_08_17_18_30/opt.json new file mode 100644 index 0000000000000000000000000000000000000000..f659161dbce9315db3afb6f07f436bdc09437da6 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/opt.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c93c28739229f5e35afc1239e1f30e0cad28353909eed88b6d65732943a5ac61 +size 1370 diff --git a/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20.log b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20.log new file mode 100644 index 0000000000000000000000000000000000000000..f02f07187a75772c6e33e37b9119a9c2a2de1a9a --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea621825b2f1d618daf456f872246d6d50bd3729a36606c7cdcf75dcddbec57a +size 30298 diff --git a/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20_back.log b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20_back.log new file mode 100644 index 0000000000000000000000000000000000000000..70e4442166fc357908c393d6e759de13925e62b5 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20_back.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b9976e0b0049f434e91251cfcde27b9a2334e95216d995ada4699f83d889c9 +size 31752 diff --git a/results/tvr-top20-2024_07_08_21_19_47/best_test_predictions.json b/results/tvr-top20-2024_07_08_21_19_47/best_test_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..5639876bebc87d566824ea9638bb335433c70eba --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/best_test_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12895f4d15d70eff1737745bda045cf6fb1bf6e85aa4e8c4cdd86633cb70274a +size 14324579 diff --git a/results/tvr-top20-2024_07_08_21_19_47/best_val_predictions.json b/results/tvr-top20-2024_07_08_21_19_47/best_val_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..de0d7b9bb06dc200822c0064147bd717f9fcc4cf --- /dev/null +++ 
b/results/tvr-top20-2024_07_08_21_19_47/best_val_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103076d328e1b7efdc2773625c38fc73a29492a67bcb27e023af73f8b21c8732 +size 2571786 diff --git a/results/tvr-top20-2024_07_08_21_19_47/code.zip b/results/tvr-top20-2024_07_08_21_19_47/code.zip new file mode 100644 index 0000000000000000000000000000000000000000..58dd71a9e4af908ac0eda92c20785c87a430e792 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/code.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b0711364459d5340f2e887420295145188a9008d5b50b5ddde46b221645c23 +size 1141392 diff --git a/results/tvr-top20-2024_07_08_21_19_47/model.ckpt b/results/tvr-top20-2024_07_08_21_19_47/model.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..c9a590306d62773c881a69ba31beb6f6a2e46775 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/model.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baff5eaebb7f211640af4e21f2876be344eaa95431ab32398ac7260e9803471f +size 190742082 diff --git a/results/tvr-top20-2024_07_08_21_19_47/opt.json b/results/tvr-top20-2024_07_08_21_19_47/opt.json new file mode 100644 index 0000000000000000000000000000000000000000..0176d847c25f14ba27dfd1451356bb0b6da1d651 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/opt.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90d02a58cbb9a5ea0f23e3fefedd3f8f7b8852332b4877cfe7ba2833ca699071 +size 1368 diff --git a/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log new file mode 100644 index 0000000000000000000000000000000000000000..b9f1268610a5c735972e7a48f929a783e0f3e028 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:895455a13565da5f3d44126722152288a3057649fef1daa94d7558d490d97d81 +size 24491 diff --git a/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40_back.log b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40_back.log new file mode 100644 index 0000000000000000000000000000000000000000..f7ad9c63e1627cded243fb06478e3b70fb6e27e9 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40_back.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6085e3055b53b0afc63799813027a70b1d1999beeecf22b0accda3b5a60fe8cc +size 26137 diff --git a/results/tvr-top40-2024_07_11_10_58_46/best_test_predictions.json b/results/tvr-top40-2024_07_11_10_58_46/best_test_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..d0a434614b7ee800f239c5e6a76019d5c26e4721 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/best_test_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5deaab54d6eec95172c5877b38dc72712f76b0357f26e255938a55835627ed2c +size 14329598 diff --git a/results/tvr-top40-2024_07_11_10_58_46/best_val_predictions.json b/results/tvr-top40-2024_07_11_10_58_46/best_val_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..98fabefdbf27f245fa101dbc8eabe5b50a71b003 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/best_val_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d7b68cde82958c1a7039210d2ac4bb5cfb5083abee6bbb550083395061a8a8 +size 2572649 diff 
--git a/results/tvr-top40-2024_07_11_10_58_46/code.zip b/results/tvr-top40-2024_07_11_10_58_46/code.zip new file mode 100644 index 0000000000000000000000000000000000000000..54e30461b2f77e125a11b55c9432c386f9d694ad --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/code.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88e51fa09336f4a4545dc2e281cfe8cea943daf17de87c12b6b75d226fdb61dd +size 1141399 diff --git a/results/tvr-top40-2024_07_11_10_58_46/model.ckpt b/results/tvr-top40-2024_07_11_10_58_46/model.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..972140053e36fd7f042646e5c34b5b2e871e5674 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/model.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eba8e53656fed1ddcbb7d8129bd6c72862797c63684f11121a9a78c86b30c70 +size 190742082 diff --git a/results/tvr-top40-2024_07_11_10_58_46/opt.json b/results/tvr-top40-2024_07_11_10_58_46/opt.json new file mode 100644 index 0000000000000000000000000000000000000000..b224f2b0c6d74eb320f9825a406b104cd8e2078f --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/opt.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e03b5de0524d803c796aaef3fa4aaf1152cfae63644403e236262fe1a4663b3 +size 1368 diff --git a/run_disjoint_top01.sh b/run_disjoint_top01.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d4108989ee2e7b4325c9985aeb4de98721898ee --- /dev/null +++ b/run_disjoint_top01.sh @@ -0,0 +1,19 @@ +python train.py \ + --model_name conquer \ + --dataset_config config/tvr_ranking_data_config_top01.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --max_vcmr_video 10 \ + --similarity_measure disjoint \ + --bsz 196 \ + --eval_query_bsz 8 \ + --eval_num_per_epoch 0.05 \ + --n_epoch 4000 \ + --exp_id top01 + + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_disjoint_top01.sh + diff --git a/run_disjoint_top20.sh b/run_disjoint_top20.sh new file mode 100644 index 0000000000000000000000000000000000000000..8d86aa0d9761a30974f0f7ccb52b0d0c2cdc0386 --- /dev/null +++ b/run_disjoint_top20.sh @@ -0,0 +1,19 @@ +python train.py \ + --model_name conquer \ + --dataset_config config/tvr_ranking_data_config_top20.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --max_vcmr_video 10 \ + --similarity_measure disjoint \ + --bsz 196 \ + --eval_query_bsz 8 \ + --eval_num_per_epoch 1 \ + --n_epoch 200 \ + --exp_id top20 + + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_disjoint_top20.sh + diff --git a/run_disjoint_top40.sh b/run_disjoint_top40.sh new file mode 100644 index 0000000000000000000000000000000000000000..067287d8b3d6aa601c9cc7f09a719f3283431668 --- /dev/null +++ b/run_disjoint_top40.sh @@ -0,0 +1,19 @@ +python train.py \ + --model_name conquer \ + --dataset_config config/tvr_ranking_data_config_top40.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --max_vcmr_video 10 \ + --similarity_measure disjoint \ + --bsz 196 \ + --eval_query_bsz 8 \ + --eval_num_per_epoch 2 \ + --n_epoch 100 \ + --exp_id top40 + + # qsub -I -l select=1:ngpus=1 
-P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_disjoint_top40.sh + diff --git a/standalone_eval/__init__.py b/standalone_eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/standalone_eval/eval.py b/standalone_eval/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..164a18e0aa40e7973cade49f4f4dd24fcf497678 --- /dev/null +++ b/standalone_eval/eval.py @@ -0,0 +1,300 @@ +""" +Load prediction file and GT file to calculate TVR metrics: +- recall at top K (R@K), for a specified IoU, where K in [1, 5, 10, 100], IoU in [0.5, 0.7] +""" +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict, defaultdict + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def pad_sequences_1d_np(sequences, dtype=np.float32): + + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max(lengths)) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max(lengths)), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask + + +def compute_temporal_iou_batch(preds, gt): + """ compute intersection-over-union along temporal axis + This function is significantly faster than `compute_temporal_iou`, + the result should be the same. 
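+    Note: the denominator below is the span from the earliest start to the latest end rather + than a strict union; for two single intervals this equals the true union whenever they + overlap, and when they are disjoint the intersection is zero anyway, so the returned IoU + is unchanged.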
+ Args: + preds: np.ndarray, (N, 2), [st (float), ed (float)] * N + gt: [st (float), ed (float)] + Returns: + iou (float): np.ndarray, (N, ) + + References: + for np.divide with zeros, see https://stackoverflow.com/a/37977222 + """ + intersection = np.maximum(0, np.minimum(preds[:, 1], gt[1]) - np.maximum(preds[:, 0], gt[0])) + union = np.maximum(preds[:, 1], gt[1]) - np.minimum(preds[:, 0], gt[0]) # not the correct union though + return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0) + + +def get_rounded_percentage(float_number, n_floats=2): + return round(float_number * 100, n_floats) + + +TASK_TYPES = OrderedDict([ + ("VCMR", "Video Corpus Moment Retrieval"), + ("SVMR", "Single Video Moment Retrieval"), + ("VR", "regular Video Retrieval") +]) + + +def eval_by_task_type(moment_predictions, video2idx, ground_truth, + iou_thds=(0.5, 0.7), recall_topks=(1, 5, 10, 100), + task_type="SVMR", max_pred_per_query=100, match_number=True, verbose=True, use_desc_type=True): + """ a predicted triplet is positive only if: + 1) its vid_name matches the GT vid_name + 2) IoU between its timestamp and GT timestamp is higher than the given threshold + + moment_predictions w.r.t. different task_type: + For each query, evaluated on top max_pred_per_query [vid_name, st, ed] triplets. (score entry ignored) + VCMR: vid_name might be repeating. + SVMR: vid_name is fixed to be the GT vid_name. + VR: vid_name is not repeating, st and ed will not be used. + + Args: + video2idx: {vid_name (str): index (int), ...} + moment_predictions: list(dict), each dict is { + "desc": str, + "query_id": int, + "predictions": [vid_name_idx (int), st (float), ed (float), score (float)] * n_pred, + sorted predictions, n_pred could be different for all dicts. For each prediction, + only the first 3 elements [vid_name (str), st (float), ed (float),] are used, + any other following elements are ignored. We leave score here for record. + } + ground_truth: list(dict), each dict is { + "desc": str, + "query_id": int, + "type": str, one of [v, t, vt] + "vid_name": str + "ts": [st (float), ed (float)], or list([st (float), ed (float)]), len == 4. + ... + } + iou_thds: temporal IoU thresholds + recall_topks: recall at different top k + task_type: str, could be: ["VCMR", "SVMR", "VR"], see TASK_TYPES for definition. + max_pred_per_query: int, only top max_pred_per_query predictions for each query are used. + match_number: bool, must set to True if when do evaluation, False is only used for debug. 
+        verbose: bool, print intermediate information if True
+        use_desc_type: bool, only TVR has desc type annotations (v / t / vt)
+    Returns:
+        metrics, metrics_by_type: OrderedDicts mapping metric names to rounded percentages
+    """
+    assert task_type in TASK_TYPES, "task_type must be one of {}".format(list(TASK_TYPES.keys()))
+    if verbose:
+        print("Running evaluation with task_type {}, n results {}; n gt {}"
+              .format(task_type, len(moment_predictions), len(ground_truth)))
+
+    predictions_by_query_id = {e["query_id"]: e for e in moment_predictions}
+    gt_by_query_id = {e["query_id"]: e for e in ground_truth}
+    desc_type2idx = {"v": 0, "t": 1, "vt": 2}
+    desc_types = []  # n_desc
+
+    if match_number:
+        assert set(gt_by_query_id.keys()) == set(predictions_by_query_id.keys()), \
+            "query_ids in predictions and ground_truth must match"
+        # assert len(set([len(e["predictions"]) for e in predictions_by_query_id.values()])) == 1, \
+        #     "all queries must have the same number of predictions"
+
+    pred_info_matrix_collection = []
+    for k, gt_item in tqdm(gt_by_query_id.items(), desc="Loop over moments", leave=False):
+        if not match_number and k not in predictions_by_query_id:
+            continue
+        pred_info_matrix = np.array(
+            [e[:3] for e in predictions_by_query_id[k]["predictions"]][:max_pred_per_query],
+            dtype=np.float32)  # (n_pred, 3)
+        if use_desc_type:
+            desc_types.append(desc_type2idx[gt_item["type"]])
+        vid_name_matched_pred = pred_info_matrix[:, 0] == video2idx[gt_item["vid_name"]]  # bool, (n_pred, )
+        pred_info_matrix = np.concatenate([pred_info_matrix, vid_name_matched_pred[:, None]], axis=1)  # (n_pred, 4)
+
+        # add 1 + len(iou_thds) columns, iou_scores, iou_corrects for each iou_thd.
+        iou_thd_corrects_columns = []
+        if len(gt_item["ts"]) >= 4:  # didemo, for all 3 splits, at least 4 ts for each, < 0.5% has more than 4.
+            least_n_overlap = 2  # True if overlapped with at least least_n_overlap GT ts.
+            iou_corrects_dict = defaultdict(list)
+            for single_gt_ts in gt_item["ts"]:
+                single_gt_ts = np.array(single_gt_ts, dtype=np.float32)  # (2, )
+                # iou scores of the predictions that have wrong vid_name are set to 0.
+                iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred
+                for iou_thd in iou_thds:
+                    iou_corrects_dict[iou_thd].append(iou_scores >= iou_thd)
+            for iou_thd in iou_thds:
+                iou_corrects = sum(iou_corrects_dict[iou_thd]) >= least_n_overlap  # bool, (n_pred, )
+                iou_thd_corrects_columns.append(iou_corrects[:, None])
+
+        else:  # should be 2, len([st, ed]) == 2
+            single_gt_ts = np.array(gt_item["ts"], dtype=np.float32)  # (2, )
+            # iou scores of the predictions that have wrong vid_name are set to 0.
+ iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred + + for iou_thd in iou_thds: + iou_corrects = iou_scores >= iou_thd # bool, (n_pred, ) + iou_thd_corrects_columns.append(iou_corrects[:, None]) + + pred_info_matrix = np.concatenate([pred_info_matrix, ] + iou_thd_corrects_columns, axis=1) # (n_pred, 6) + pred_info_matrix_collection.append(pred_info_matrix) + + # column header [vid_name_idx (int), st (float), ed (float), is_vid_name_match (bool), + # iou_scores>=iou_thd0 (bool), iou_scores>=iou_thd1 (bool)] + pred_info_matrix_collection = pad_sequences_1d_np(pred_info_matrix_collection)[0] # (n_desc, n_pred, 6) + if use_desc_type: + desc_types = np.array(desc_types) # (n_desc) + + # results wrapper + metrics = OrderedDict() + metrics_by_type = OrderedDict() + + iou_c_offset = 4 # iou_corrects column index starts here + if task_type == "VCMR": + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = \ + get_rounded_percentage(np.mean(np.sum(iou_corrects[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(iou_corrects[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type + ) + elif task_type == "SVMR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + n_desc = len(vid_name_matched) + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = get_rounded_percentage(np.mean( + [np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 for idx in range(n_desc)] + )) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum([np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 and type_corrects[idx] + for idx in range(n_desc)]) + / n_desc_in_type) + + elif task_type == "VR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + for k in recall_topks: + metrics["r{}".format(k)] = \ + get_rounded_percentage(np.mean(np.sum(vid_name_matched[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for k in recall_topks: + metrics_by_type["{}-r{}".format(desc_type, k)] = 
get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(vid_name_matched[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type) + else: + raise ValueError("task_type wrong.") + if use_desc_type: + metrics_by_type["desc_type_ratio"] = "v {} t {} vt {}"\ + .format(*[get_rounded_percentage(1.0 * np.sum(desc_types == desc_type2idx[k]) / len(desc_types)) + for k in ["v", "t", "vt"]]) + return metrics, metrics_by_type + + +def eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True, match_number=True, use_desc_type=True): + video2idx = submission["video2idx"] + submitted_task_types = [k for k in TASK_TYPES if k in submission] + if verbose: + print("Evaluating for task {}".format(submitted_task_types)) + eval_metrics = OrderedDict() + metrics_raw_dict = {} + for task_type in submitted_task_types: + metrics, metrics_by_type = eval_by_task_type( + submission[task_type], video2idx, ground_truth, + iou_thds=iou_thds, recall_topks=(1, 5, 10, 100), + task_type=task_type, max_pred_per_query=100, + match_number=match_number, verbose=verbose, use_desc_type=use_desc_type) + metrics_raw_dict[task_type] = metrics + metrics_raw_dict[task_type+"_by_type"] = metrics_by_type + + for task_type in submitted_task_types: + eval_metrics[task_type] = metrics_raw_dict[task_type] + if use_desc_type: + for task_type in submitted_task_types: + eval_metrics[task_type+"_by_type"] = metrics_raw_dict[task_type+"_by_type"] + return eval_metrics + + +def eval_main(): + import argparse + parser = argparse.ArgumentParser(description="TVR Evaluation Script") + parser.add_argument("--submission_path", type=str, help="path to generated prediction file") + parser.add_argument("--gt_path", type=str, help="path to GT file") + parser.add_argument("--save_path", type=str, help="path to save the results") + parser.add_argument("--not_verbose", action="store_true") + args = parser.parse_args() + + verbose = not args.not_verbose + submission = load_json(args.submission_path) + gt = load_jsonl(args.gt_path) + results = eval_retrieval(submission, gt, iou_thds=(0.5, 0.7), verbose=verbose) + if verbose: + print(json.dumps(results, indent=4)) + + with open(args.save_path, "w") as f: + f.write(json.dumps(results, indent=4)) + + +if __name__ == '__main__': + eval_main() diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..d4c6d7a5325ca3e7459c9e220cddac31de951154 --- /dev/null +++ b/train.py @@ -0,0 +1,246 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from tqdm import tqdm, trange +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from config.config import BaseOptions +from model.conquer import CONQUER +from data_loader.second_stage_start_end_dataset import StartEndDataset +from inference import eval_epoch +from optim.adamw import AdamW +from utils.basic_utils import TimeTracker, load_config, save_json, get_logger +from utils.model_utils import count_parameters, move_cuda, start_end_collate + + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def build_optimizer(model, opts): + # Prepare 
optimizer + param_optimizer = [(n, p) for n, p in model.named_parameters() + if (n.startswith('encoder') or n.startswith('query_weight')) and p.requires_grad ] + + param_top = [(n, p) for n, p in model.named_parameters() + if ( not n.startswith('encoder') and not n.startswith('query_weight')) and p.requires_grad] + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_top + if not any(nd in n for nd in no_decay)], + 'weight_decay': opts.wd}, + {'params': [p for n, p in param_top + if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0}, + {'params': [p for n, p in param_optimizer + if not any(nd in n for nd in no_decay)], + 'lr': opts.lr_mul * opts.lr, + 'weight_decay': opts.wd}, + {'params': [p for n, p in param_optimizer + if any(nd in n for nd in no_decay)], + 'lr': opts.lr_mul * opts.lr, + 'weight_decay': 0.0} + ] + + # currently Adam only + optimizer = AdamW(optimizer_grouped_parameters, + lr=opts.lr) + return optimizer + + +def train(model, train_data, val_data, test_data, opt, logger): + # Prepare optimizer + if opt.device.type == "cuda": + model.to(opt.device) + logger.info("CUDA enabled.") + assert len(opt.device_ids) == 1 + + train_loader = DataLoader(train_data, + collate_fn=start_end_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=True, + drop_last=True) + + # Prepare optimizer + optimizer = build_optimizer(model, opt) + thresholds = [0.3, 0.5, 0.7] + topks = [10, 20, 40] + best_val_ndcg = 0 + eval_step = len(train_loader) // opt.eval_num_per_epoch + + time_tracker = TimeTracker() + for epoch_i in range(0, opt.n_epoch): + print(f"TRAIN EPOCH: {epoch_i}|{opt.n_epoch}") + + num_training_examples = len(train_loader) + time_tracker.start("grab_data") + + for batch_idx, batch in tqdm(enumerate(train_loader), desc=f"Training {epoch_i}|{opt.n_epoch}", total=num_training_examples): + global_step = epoch_i * num_training_examples + batch_idx + time_tracker.stop("grab_data") + time_tracker.start("to_device") + model.train() + model_inputs = move_cuda(batch["model_inputs"], opt.device) + time_tracker.stop("to_device") + time_tracker.start("forward") + optimizer.zero_grad() + + loss, loss_dict = model(model_inputs) + time_tracker.stop("forward") + time_tracker.start("backward") + + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + + time_tracker.stop("backward") + time_tracker.start("grab_data") + + if global_step % 10 == 0: + print(time_tracker.report()) + time_tracker.reset_all() + for i in range(torch.cuda.device_count()): + print(f"Memory Allocated on GPU {i}: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB") + print(f"Memory Cached on GPU {i}: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB") + print("-------------------------") + + ###### ------------------- ############# + ### eval during training + if global_step % eval_step == 0 and global_step != 0: + model.eval() + + val_performance, val_predictions = eval_epoch(model, val_data, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + test_performance, test_predictions = eval_epoch(model, test_data, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + + logger.info(f"EPOCH: {epoch_i}") + line1 = "" + line2 = "VAL: " + line3 = "TEST: " + for K, vs in val_performance.items(): + for T, v in vs.items(): + line1 += f"NDCG@{K}, IoU={T}\t" + line2 += f" {v:.6f}" + + for K, vs in test_performance.items(): + for T, v in vs.items(): + 
line3 += f" {v:.6f}" + logger.info(line1) + logger.info(line2) + logger.info(line3) + + anchor_ndcg = val_performance[20][0.5] + if anchor_ndcg > best_val_ndcg: + print("~"*40) + save_json(val_predictions, os.path.join(opt.results_dir, "best_val_predictions.json")) + save_json(test_predictions, os.path.join(opt.results_dir, "best_test_predictions.json")) + best_val_ndcg = anchor_ndcg + logger.info("BEST " + line2) + logger.info("BEST " + line3) + checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + logger.info("save checkpoint: {}".format(opt.ckpt_filepath)) + print("~"*40) + + logger.info("") + + +def start_training(): + opt = BaseOptions().parse() + logger = get_logger(opt.results_dir, opt.model_name +"_"+ opt.exp_id) + set_seed(opt.seed) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + + data_config = load_config(opt.dataset_config) + + + train_dataset = StartEndDataset( + config=data_config, + data_path = data_config.train_data_path, + vr_rank_path = data_config.train_first_VR_ranklist_path, + mode="train", + data_ratio=opt.data_ratio, + neg_video_num=opt.neg_video_num, + use_extend_pool=opt.use_extend_pool, + ) + + val_dataset = StartEndDataset( + config = data_config, + data_path = data_config.val_data_path, + vr_rank_path = data_config.val_first_VR_ranklist_path_hero, + mode="val", + max_ctx_len=opt.max_ctx_len, + max_desc_len=opt.max_desc_len, + clip_length=opt.clip_length, + ctx_mode = opt.ctx_mode, + data_ratio = opt.data_ratio, + is_eval = True, + inference_top_k = opt.max_vcmr_video, + ) + + test_dataset = StartEndDataset( + config = data_config, + data_path = data_config.test_data_path, + vr_rank_path = data_config.test_first_VR_ranklist_path_hero, + mode="val", + max_ctx_len=opt.max_ctx_len, + max_desc_len=opt.max_desc_len, + clip_length=opt.clip_length, + ctx_mode = opt.ctx_mode, + data_ratio = opt.data_ratio, + is_eval = True, + inference_top_k = opt.max_vcmr_video, + ) + + + model_config = load_config(opt.model_config) + + logger.info("model_config {}".format(pprint.pformat(model_config,indent=4))) + + model = CONQUER( + model_config, + visual_dim = opt.visual_dim, + text_dim =opt.text_dim, + query_dim = opt.query_dim, + hidden_dim = opt.hidden_dim, + video_len= opt.max_ctx_len, + ctx_mode = opt.ctx_mode, + lw_video_ce = opt.lw_video_ce, # video cross-entropy loss weight + lw_st_ed = opt.lw_st_ed, # moment cross-entropy loss weight + similarity_measure=opt.similarity_measure, + use_debug = opt.debug, + no_output_moe_weight = opt.no_output_moe_weight) + + count_parameters(model) + + logger.info("Start Training...") + train(model, train_dataset, val_dataset, test_dataset, opt, logger) + + +if __name__ == '__main__': + start_training() + diff --git a/unused/convert_h5_lmdb.py b/unused/convert_h5_lmdb.py new file mode 100644 index 0000000000000000000000000000000000000000..56318222b518f945accdce7893357fa2e0050e73 --- /dev/null +++ b/unused/convert_h5_lmdb.py @@ -0,0 +1,27 @@ +import h5py +import lmdb +import numpy as np +from tqdm import tqdm + +h5_path = "data/h5/features/resnet_slowfast_1.5.h5" +lmdb_path = "data/features/resnet_slowfast_1.5" + +h5_data = h5py.File(h5_path, 'r') +env = lmdb.open(lmdb_path, readonly=False, create=True, max_dbs=0, map_size=1 * 1024**3) + +# Open or create the LMDB database +n = 0 +with env.begin(write=True) as txn: + # Iterate over 
items in the HDF5 file + for key in tqdm(h5_data.keys()): + print(key) + # Read the feature array for the current key + feature = h5_data[key][:] + buffer = np.getbuffer(feature) + txn.put(key.encode(), buffer) + + n += 1 + if n > 10: + break +print("Conversion completed.") + diff --git a/unused/convert_lmdb_h5.py b/unused/convert_lmdb_h5.py new file mode 100644 index 0000000000000000000000000000000000000000..2b5275c2a021962866b7a535f684e5090a49bf47 --- /dev/null +++ b/unused/convert_lmdb_h5.py @@ -0,0 +1,57 @@ +import h5py +import lmdb +import numpy as np +from tqdm import tqdm +import io +import msgpack_numpy +import msgpack + + +lmdb_path = "data/TVR_Ranking_val_top100_hero" +h5_path = "data/h5/TVR_Ranking_val_top100_hero.h5" +# Open the LMDB environment +env = lmdb.open(lmdb_path, readonly=True, max_dbs=0, max_readers=4096 * 8, readahead=False) + +h5_data = h5py.File(h5_path, 'w') +with env.begin(write=False, buffers=True) as txn: + cursor = txn.cursor() + keys = list(cursor.iternext(values=False)) # List of keys for progress tracking + for key in tqdm(keys, desc="Processing LMDB to HDF5"): + key_str = bytes(key).decode() + value = cursor.get(key) + _external_inference_vr_res = msgpack.loads(value) + h5_data.create_dataset(key_str, data=_external_inference_vr_res) +print("Conversion completed.") +h5_data.close() + +# lmdb_path = "data/features/resnet_slowfast_1.5" +# h5_path = "data/h5/features/resnet_slowfast_1.5.h5" +# env = lmdb.open(lmdb_path, readonly=True, max_dbs=0, max_readers=4096 * 8, readahead=False) +# h5_data = h5py.File(h5_path, 'w') +# with env.begin(write=False, buffers=True) as txn: +# cursor = txn.cursor() +# keys = list(cursor.iternext(values=False)) # List of keys for progress tracking +# for key in tqdm(keys, desc="Processing LMDB to HDF5"): +# key_str = bytes(key).decode() +# value = cursor.get(key) +# img_dump = {k: np.copy(v) for k, v in msgpack_numpy.loads(value, raw=False).items()} +# visual_feat = img_dump['features'] # Adjust if needed, like [:self.max_ctx_len] +# h5_data.create_dataset(key_str, data=visual_feat) +# print("Conversion completed.") +# h5_data.close() + + +# lmdb_path = "data/features/tvr_sub_pretrained_w_sub_query_max_cl-1.5" +# h5_path = "data/h5/features/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5" +# env = lmdb.open(lmdb_path, readonly=True, max_dbs=0, max_readers=4096 * 8, readahead=False) +# h5_data = h5py.File(h5_path, 'w') +# with env.begin(write=False, buffers=True) as txn: +# cursor = txn.cursor() +# for key, value in tqdm(cursor): +# key_str = bytes(key).decode() +# with io.BytesIO(value) as reader: +# feat_dump = np.load(reader, allow_pickle=True) +# sub_feat = feat_dump["features"] +# h5_data.create_dataset(key_str, data=sub_feat) +# print("Conversion completed.") +# h5_data.close() diff --git a/unused/find_best_epoch.py b/unused/find_best_epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..d3499a9c0431753c79d1815b947bcb46e444cbe0 --- /dev/null +++ b/unused/find_best_epoch.py @@ -0,0 +1,21 @@ +def rewrite_epoch(filename, new_file_name): + max_value = float(-100) + new_file = [] + + with open(filename, 'r') as file: + for line in file: + new_file.append(line) + if line.startswith("INFO:VAL"): + anchor = float(line.split()[5]) # Assuming the value is at the 5th index + if anchor > max_value: + max_value = anchor + print(max_value) + new_file.append("BEST: " + line) + + with open(new_file_name, 'w') as file: + file.writelines(new_file) + +# Example usage +filename = 
"results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log" +new_file_name = "new.log" +best_epoch = rewrite_epoch(filename, new_file_name) diff --git a/unused/run_exclusive.sh b/unused/run_exclusive.sh new file mode 100644 index 0000000000000000000000000000000000000000..94466530fe9818ead1069d8a38c1f0abaebb4be8 --- /dev/null +++ b/unused/run_exclusive.sh @@ -0,0 +1,13 @@ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --dataset_config config/tvr_data_config.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --exp_id debug \ + --max_vcmr_video 10 \ + --similarity_measure exclusive + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_exclusive.sh + diff --git a/unused/run_general.sh b/unused/run_general.sh new file mode 100644 index 0000000000000000000000000000000000000000..464fe5cca75e41afb1e171f4d637d2fa266fa38b --- /dev/null +++ b/unused/run_general.sh @@ -0,0 +1,15 @@ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --dataset_config config/tvrranking_data_config.json \ + --use_interal_vr_scores \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR VR \ + --bsz 64 \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --exp_id reproduce \ + --max_vcmr_video 2 \ + --similarity_measure general \ + --eval_num_per_epoch 1 + # qsub -I -l select=1:ngpus=1 -P gs_slab -q slab_gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_general.sh + diff --git a/unused/run_top01.sh b/unused/run_top01.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5ff950a11cb1d5d2eaddfa08d7f526b38e667c1 --- /dev/null +++ b/unused/run_top01.sh @@ -0,0 +1,27 @@ +# CUDA_VISIBLE_DEVICES=0 \ +python method_tvr/train.py \ + --model_name ReLoCLNet \ + --model_config_path ./configs/ReLoCLNet.yaml \ + --dset_name TVR-Ranking \ + --eval_split_name val \ + --nms_thd -1 \ + --results_root results/tvr_ranking \ + --train_path data/TVR_Ranking/train_top01.jsonl \ + --val_path data/TVR_Ranking/val.jsonl \ + --test_path data/TVR_Ranking/test.jsonl \ + --clip_length 1.5 \ + --vid_feat_size 1024 \ + --ctx_mode video_sub_tef \ + --no_norm_vfeat \ + --max_pred_l 16\ + --sub_feat_size 768\ + --video_duration_idx_path ./data/common_data/video_corpus.json \ + --desc_bert_path ./data/common_data/query_bert.h5 \ + --vid_feat_path /home/share/czzhang/Dataset/TVR/TVR_feature/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 \ + --sub_bert_path /home/share/czzhang/Dataset/TVR/TVR_feature/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5\ + --eval_tasks_at_training VCMR \ + --eval_num_per_epoch 0.1 \ + --n_epoch 1000 \ + --exp_id top01 + # qsub -I -l select=1:ngpus=1 -P gs_slab -q slab_gpu8 + # cd 11_TVR-Ranking/ReLoCLNet/; conda activate py11; sh run_top01.sh \ No newline at end of file diff --git a/unused/run_tvrranking.sh b/unused/run_tvrranking.sh new file mode 100644 index 0000000000000000000000000000000000000000..952ae3f7ff7d13d3b1d343d8871a673fc1ea15f3 --- /dev/null +++ b/unused/run_tvrranking.sh @@ -0,0 +1,15 @@ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --dataset_config config/tvrranking_data_config.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --bsz 64 \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --exp_id debug \ + --bsz 5 \ + --max_vcmr_video 10 \ + --num_workers 4 + + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 
11_TVR-Ranking/CONQUER/; conda activate py11; sh run.sh \ No newline at end of file diff --git a/unused/select_conquer_dataset.py b/unused/select_conquer_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..45369cd87ea1e5d29d1f7af609976f1383de8099 --- /dev/null +++ b/unused/select_conquer_dataset.py @@ -0,0 +1,49 @@ +import h5py +import lmdb +import numpy as np +import msgpack +from utils.basic_utils import load_json, save_json +from tqdm import tqdm +import os + +data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/train_top40.json" +# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/val.json" +# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/test.json" +old_data = load_json(data_path) + +new_data_path = "./data/TVR_Ranking_CONQUER/train_top40.json" +# new_data_path = "./data/TVR_Ranking_CONQUER/val.json" +# new_data_path = "./data/TVR_Ranking_CONQUER/test.json" +new_vr_path = "data/TVR_Ranking_train_top100_hero" +# new_vr_path = "data/TVR_Ranking_val_top100_hero" +# new_vr_path = "data/TVR_Ranking_test_top100_hero" + +# Destination LMDB path (for writing) + +os.makedirs(new_vr_path, exist_ok=True) + +consolidated_path = "/home/renjie.liang/datasets/tvr_feature_release/data/consolidated_vr_results" +vr_pool = lmdb.open(consolidated_path, readonly=True, create=False, max_readers=4096 * 8, readahead=False) +vr_txn = vr_pool.begin(buffers=True) + +# Open the new LMDB for writing +new_vr_pool = lmdb.open(new_vr_path, readonly=False, create=True, max_dbs=0, map_size=10 * 1024**3) # 10 GiB +clean_data = [] +with new_vr_pool.begin(write=True) as new_vr_txn: + for i in tqdm(old_data): + query_id = i["query_id"] + # Retrieve the data from the source database + vr_data = vr_txn.get(str(query_id).encode()) + if vr_data is not None: + clean_data.append(i) + # Data exists, so load it using msgpack and then put it into the new database + vr_res = msgpack.loads(vr_data) + # Ensure the data is serialized before storing + vr_data_serialized = msgpack.dumps(vr_res) + new_vr_txn.put(str(query_id).encode(), vr_data_serialized) + +# Close both the source and destination databases after operations are complete +save_json(clean_data, new_data_path) +print(len(old_data), "->", len(clean_data)) +vr_pool.close() +new_vr_pool.close() \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/basic_utils.py b/utils/basic_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..68666749ee3253d8f14e1ecc8a2e8991d72e941d --- /dev/null +++ b/utils/basic_utils.py @@ -0,0 +1,242 @@ +import os +import json +import zipfile +import numpy as np +import pickle +from easydict import EasyDict + +def load_config(config_json_file) -> EasyDict: + with open(config_json_file, + "r", encoding='utf-8') as reader: + config = json.loads(reader.read()) + cfg = EasyDict(config) + + return cfg + + +def load_pickle(filename): + with open(filename, "rb") as f: + return pickle.load(f) + + +def save_pickle(data, filename): + with open(filename, "wb") as f: + pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def save_json(data, filename, save_pretty=False, sort_keys=False): + with open(filename, "w") as f: + if save_pretty: + f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) + 
else: + json.dump(data, f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def save_jsonl(data, filename): + """data is a list""" + with open(filename, "w") as f: + f.write("\n".join([json.dumps(e) for e in data])) + + +def save_lines(list_of_str, filepath): + with open(filepath, "w") as f: + f.write("\n".join(list_of_str)) + + +def read_lines(filepath): + with open(filepath, "r") as f: + return [e.strip("\n") for e in f.readlines()] + + +def mkdirp(p): + if not os.path.exists(p): + os.makedirs(p) + + +def flat_list_of_lists(l): + """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]""" + return [item for sublist in l for item in sublist] + + +def convert_to_seconds(hms_time): + """ convert '00:01:12' to 72 seconds. + :hms_time (str): time in comma separated string, e.g. '00:01:12' + :return (int): time in seconds, e.g. 72 + """ + times = [float(t) for t in hms_time.split(":")] + return times[0] * 3600 + times[1] * 60 + times[2] + + +def get_video_name_from_url(url): + return url.split("/")[-1][:-4] + + +def merge_dicts(list_dicts): + merged_dict = list_dicts[0].copy() + for i in range(1, len(list_dicts)): + merged_dict.update(list_dicts[i]) + return merged_dict + + +def l2_normalize_np_array(np_array, eps=1e-5): + """np_array: np.ndarray, (*, D), where the last dim will be normalized""" + return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps) + + +def make_zipfile(src_dir, save_path, enclosing_dir="", exclude_dirs=None, exclude_extensions=None, + exclude_dirs_substring=None): + """make a zip file of root_dir, save it to save_path. + exclude_paths will be excluded if it is a subdir of root_dir. + An enclosing_dir is added is specified. + """ + abs_src = os.path.abspath(src_dir) + with zipfile.ZipFile(save_path, "w") as zf: + for dirname, subdirs, files in os.walk(src_dir): + if exclude_dirs is not None: + for e_p in exclude_dirs: + if e_p in subdirs: + subdirs.remove(e_p) + if exclude_dirs_substring is not None: + to_rm = [] + for d in subdirs: + if exclude_dirs_substring in d: + to_rm.append(d) + for e in to_rm: + subdirs.remove(e) + arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:]) + zf.write(dirname, arcname) + for filename in files: + if exclude_extensions is not None: + if os.path.splitext(filename)[1] in exclude_extensions: + continue # do not zip it + absname = os.path.join(dirname, filename) + arcname = os.path.join(enclosing_dir, absname[len(abs_src) + 1:]) + zf.write(absname, arcname) + + +class AverageMeter(object): + """Computes and stores the average and current/max/min value""" + def __init__(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + + def update(self, val, n=1): + self.max = max(val, self.max) + self.min = min(val, self.min) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True): + """Dissect an array (N, D) into a list a sub-array, + np_array.shape[0] == sum(lengths), Output is a list of nd arrays, singlton dimention is kept""" + if assert_equal: + assert len(np_array) == sum(lengths) + length_indices = [0, ] + for i in range(len(lengths)): + length_indices.append(length_indices[i] + lengths[i]) + if dim == 0: + array_list = 
[np_array[length_indices[i]:length_indices[i+1]] for i in range(len(lengths))] + elif dim == 1: + array_list = [np_array[:, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + elif dim == 2: + array_list = [np_array[:, :, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + else: + raise NotImplementedError + return array_list + + +def get_ratio_from_counter(counter_obj, threshold=200): + keys = counter_obj.keys() + values = counter_obj.values() + filtered_values = [counter_obj[k] for k in keys if k > threshold] + return float(sum(filtered_values)) / sum(values) + + + + +import time +import logging +import os + +def get_logger(dir, tile): + os.makedirs(dir, exist_ok=True) + log_file = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + log_file = os.path.join(dir, "{}_{}.log".format(log_file, tile)) + + logger = logging.getLogger() + logger.setLevel('DEBUG') + BASIC_FORMAT = "%(levelname)s:%(message)s" + # DATE_FORMAT = '%Y-%m-%d %H:%M:%S' + formatter = logging.Formatter(BASIC_FORMAT) + chlr = logging.StreamHandler() + chlr.setFormatter(formatter) + + fhlr = logging.FileHandler(log_file) + fhlr.setFormatter(formatter) + fhlr.setLevel('INFO') + + logger.addHandler(chlr) + logger.addHandler(fhlr) + return logger + + + + + + + +class TimeTracker: + def __init__(self): + self.times = {} + self.start_times = {} + + def start(self, name): + self.start_times[name] = time.time() + + def stop(self, name): + if name not in self.times: + self.times[name] = 0 + if name in self.start_times: + self.times[name] += time.time() - self.start_times[name] + del self.start_times[name] + + def get_time(self, name): + return self.times.get(name, 0) + + def reset(self, name): + if name in self.times: + self.times[name] = 0 + + def reset_all(self): + self.times = {} + self.start_times = {} + + def report(self): + report = "\n".join([f"{name}: {time:.4f} seconds" for name, time in self.times.items()]) + return report diff --git a/utils/inference_utils.py b/utils/inference_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2b80d8f1ff3bf9148ec30f037ff1c416704dc822 --- /dev/null +++ b/utils/inference_utils.py @@ -0,0 +1,76 @@ +from utils.temporal_nms import temporal_non_maximum_suppression +from collections import defaultdict + + +def get_submission_top_n(submission, top_n=100): + def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + top_n_submission = dict(video2idx=submission["video2idx"], ) + for k in submission: + if k != "video2idx": + top_n_submission[k] = get_prediction_top_n(submission[k], top_n) + return top_n_submission + + + +def post_processing_vcmr_nms(vcmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100): + """ + vcmr_res: list(dict), each dict is{ + "desc": str, + "query_id": int, + "predictions": list(sublist) # each sublist is + [video_idx (int), st (float), ed(float), score (float)], video_idx could be different + } + """ + processed_vcmr_res = [] + for e in vcmr_res: + e["predictions"] = filter_vcmr_by_nms(e["predictions"], + nms_threshold=nms_thd, + max_before_nms=max_before_nms, + max_after_nms=max_after_nms) + processed_vcmr_res.append(e) + return processed_vcmr_res + + +def filter_vcmr_by_nms(all_video_predictions, nms_threshold=0.6, + max_before_nms=1000, max_after_nms=100, score_col_idx=3): + """ Apply non-maximum suppression for all the predictions for each video. 
+ 1) group predictions by video index + 2) apply nms individually for each video index group + 3) combine and sort the predictions + Args: + all_video_predictions: list(sublist), + Each sublist is [video_idx (int), st (float), ed(float), score (float)] + Note the scores are negative distances. + nms_threshold: float + max_before_nms: int + max_after_nms: int + score_col_idx: int + Returns: + + """ + predictions_neg_by_video_group = defaultdict(list) + for pred in all_video_predictions[:max_before_nms]: + predictions_neg_by_video_group[pred[0]].append(pred[1:]) # [st (float), ed(float), score (float)] + + predictions_by_video_group_neg_after_nms = dict() + for video_idx, grouped_preds in predictions_neg_by_video_group.items(): + predictions_by_video_group_neg_after_nms[video_idx] = \ + temporal_non_maximum_suppression(grouped_preds, nms_threshold=nms_threshold) + + predictions_after_nms = [] + for video_idx, grouped_preds in predictions_by_video_group_neg_after_nms.items(): + for pred in grouped_preds: + pred = [video_idx] + pred # [video_idx (int), st (float), ed(float), score (float)] + predictions_after_nms.append(pred) + + # ranking happens across videos + predictions_after_nms = sorted(predictions_after_nms, + key=lambda x: x[score_col_idx], + reverse=True)[:max_after_nms] # descending order + return predictions_after_nms \ No newline at end of file diff --git a/utils/mk_video_split_with_duration.py b/utils/mk_video_split_with_duration.py new file mode 100644 index 0000000000000000000000000000000000000000..ab5a524174febeb4515e511dc33c10a74c212d84 --- /dev/null +++ b/utils/mk_video_split_with_duration.py @@ -0,0 +1,18 @@ +from utils.basic_utils import load_json, save_json + + +def combine(video_name_split_path, video_duration_path, save_path): + video_name_split = load_json(video_name_split_path) + video_duration_dict = load_json(video_duration_path) + + combined_dict = {} + for split_name, split_video_names in video_name_split.items(): + combined_dict[split_name] = {vid_name: video_duration_dict[vid_name] + for vid_name in split_video_names} + save_json(combined_dict, save_path) + + +if __name__ == '__main__': + import sys + combine(*sys.argv[1:]) + diff --git a/utils/model_utils.py b/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d2dead4ace10a865bd310eee3b1cbe953d3b3ddd --- /dev/null +++ b/utils/model_utils.py @@ -0,0 +1,68 @@ +__author__ = "Jie Lei" + +# ref: https://github.com/lichengunc/MAttNet/blob/master/lib/layers/lang_encoder.py#L11 +# ref: https://github.com/easonnie/flint/blob/master/torch_util.py#L272 +import torch +from torch.utils.data.dataloader import default_collate + +VERY_NEGATIVE_NUMBER = -1e10 +VERY_POSITIVE_NUMBER = 1e10 + +def count_parameters(model, verbose=True): + """Count number of parameters in PyTorch model, + References: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7. 
+ + from utils.utils import count_parameters + count_parameters(model) + import sys + sys.exit(1) + """ + n_all = sum(p.numel() for p in model.parameters()) + n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + if verbose: + print("Parameter Count: all {:,d}; trainable {:,d}".format(n_all, n_trainable)) + return n_all, n_trainable + +def mask_logits(target, mask): + return target * mask + (1 - mask) * VERY_NEGATIVE_NUMBER + +def move_cuda(batch,device): + # move to cuda + for key, value in batch.items(): + if isinstance(value, dict): + for _key, _value in value.items(): + batch[key][_key] = _value.cuda(non_blocking=True, device=device) + elif isinstance(value, (list,)): + for i in range(len(value)): + batch[key][i] = value[i].cuda(non_blocking=True, device=device) + else: + batch[key] = value.cuda(non_blocking=True, device=device) + + return batch + +def start_end_collate(batch): + batch_meta = [e["meta"] for e in batch] # no need to collate + + batched_data = default_collate([e["model_inputs"] for e in batch]) + return {"meta":batch_meta, "model_inputs":batched_data} + + +# def vsmr_start_end_collate(batch): +# batch_meta = [e["meta"] for e in batch] # no need to collate +# +# batched_data = dict() +# sample_batch_data = batch[0]["model_inputs"] +# +# for key in ["visual", "sub"]: +# if key in sample_batch_data.keys(): +# batched_data[key] = dict() +# for key_2 in ["feat","feat_mask","feat_pos_id","feat_token_id"]: +# batched_data[key][key_2] = torch.cat(tuple(e["model_inputs"][key][key_2] for e in batch),dim=0) +# +# +# for key in ["query", "st_ed_indices" ]: +# if key in sample_batch_data.keys(): +# batched_data[key] = default_collate([e["model_inputs"][key] for e in batch]) +# +# return {"meta":batch_meta, "model_inputs":batched_data} + diff --git a/utils/temporal_nms.py b/utils/temporal_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..545ed8045d7da4a6a831395029e39c0f803025d5 --- /dev/null +++ b/utils/temporal_nms.py @@ -0,0 +1,74 @@ +""" +Non-Maximum Suppression for video proposals. +""" + + +def compute_temporal_iou(pred, gt): + """ deprecated due to performance concerns + compute intersection-over-union along temporal axis + Args: + pred: [st (float), ed (float)] + gt: [st (float), ed (float)] + Returns: + iou (float): + + Ref: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + """ + intersection = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0])) + union = max(pred[1], gt[1]) - min(pred[0], gt[0]) # not the correct union though + if union == 0: + return 0 + else: + return 1.0 * intersection / union + + +def temporal_non_maximum_suppression(predictions, nms_threshold, max_after_nms=100): + """ + Args: + predictions: list(sublist), each sublist is [st (float), ed(float), score (float)], + note larger scores are better and are preserved. For metrics that are better when smaller, + please convert to its negative, e.g., convert distance to negative distance. 
+ nms_threshold: float in [0, 1] + max_after_nms: + Returns: + predictions_after_nms: list(sublist), each sublist is [st (float), ed(float), score (float)] + References: + https://github.com/wzmsltw/BSN-boundary-sensitive-network/blob/7b101fc5978802aa3c95ba5779eb54151c6173c6/Post_processing.py#L42 + """ + if len(predictions) == 1: # only has one prediction, no need for nms + return predictions + + predictions = sorted(predictions, key=lambda x: x[2], reverse=True) # descending order + + tstart = [e[0] for e in predictions] + tend = [e[1] for e in predictions] + tscore = [e[2] for e in predictions] + rstart = [] + rend = [] + rscore = [] + while len(tstart) > 1 and len(rscore) < max_after_nms: # max 100 after nms + idx = 1 + while idx < len(tstart): # compare with every prediction in the list. + if compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]) > nms_threshold: + # rm highly overlapped lower score entries. + tstart.pop(idx) + tend.pop(idx) + tscore.pop(idx) + # print("--------------------------------") + # print(compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]])) + # print([tstart[0], tend[0]], [tstart[idx], tend[idx]]) + # print(tstart.pop(idx), tend.pop(idx), tscore.pop(idx)) + else: + # move to next + idx += 1 + rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + if len(rscore) < max_after_nms and len(tstart) >= 1: # add the last, possibly empty. + rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + predictions_after_nms = [[st, ed, s] for s, st, ed in zip(rscore, rstart, rend)] + return predictions_after_nms diff --git a/utils/tensor_utils.py b/utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..72497127fdbbd935bfc8c42b5fae723db04d73f8 --- /dev/null +++ b/utils/tensor_utils.py @@ -0,0 +1,141 @@ +import numpy as np +import torch + + +def pad_sequences_1d(sequences, dtype=torch.long, device=torch.device("cpu"), fixed_length=None): + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + device: + fixed_length: pad all seq in sequences to fixed length. All seq should have a length <= fixed_length. + return will be of shape [len(sequences), fixed_length, ...] 
+ Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=torch.long) + >>> test_data_3d = [torch.randn(2,3,4), torch.randn(4,3,4), torch.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=torch.float) + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + if "torch" in str(dtype): + sequences = [torch.tensor(s, dtype=dtype, device=device) for s in sequences] + else: + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + if fixed_length is not None: + max_length = fixed_length + else: + max_length = max(lengths) + if isinstance(sequences[0], torch.Tensor): + assert "torch" in str(dtype), "dtype and input type does not match" + padded_seqs = torch.zeros((len(sequences), max_length) + extra_dims, dtype=dtype, device=device) + mask = torch.zeros((len(sequences), max_length), dtype=torch.float32, device=device) + else: # np + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max_length) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max_length), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask # , lengths + + +def pad_sequences_2d(sequences, dtype=torch.long): + """ Pad a double-nested list or a sequence of n-d torch tensor into a (n+1)-d tensor, + only allow the first two dims has variable lengths + Args: + sequences: list(n-d tensor or list) + dtype: torch.long for word indices / torch.float (float32) for other cases + Returns: + Examples: + >>> test_data_list = [[[1, 3, 5], [3, 7, 4, 1]], [[98, 34, 11, 89, 90], [22], [34, 56]],] + >>> pad_sequences_2d(test_data_list, dtype=torch.long) # torch.Size([2, 3, 5]) + >>> test_data_3d = [torch.randn(2,2,4), torch.randn(4,3,4), torch.randn(1,5,4)] + >>> pad_sequences_2d(test_data_3d, dtype=torch.float) # torch.Size([2, 3, 5]) + >>> test_data_3d2 = [[torch.randn(2,4), ], [torch.randn(3,4), torch.randn(5,4)]] + >>> pad_sequences_2d(test_data_3d2, dtype=torch.float) # torch.Size([2, 3, 5]) + # TODO add support for numpy array + """ + bsz = len(sequences) + para_lengths = [len(seq) for seq in sequences] + max_para_len = max(para_lengths) + sen_lengths = [[len(word_seq) for word_seq in seq] for seq in sequences] + max_sen_len = max([max(e) for e in sen_lengths]) + + if isinstance(sequences[0], torch.Tensor): + extra_dims = sequences[0].shape[2:] + elif isinstance(sequences[0][0], torch.Tensor): + extra_dims = sequences[0][0].shape[1:] + else: + sequences = [[torch.Tensor(word_seq, dtype=dtype) for word_seq in seq] for seq in sequences] + extra_dims = () + + padded_seqs = torch.zeros((bsz, max_para_len, max_sen_len) + extra_dims, dtype=dtype) + mask = torch.zeros(bsz, max_para_len, max_sen_len).float() + + for b_i in range(bsz): + for sen_i, sen_l in enumerate(sen_lengths[b_i]): + padded_seqs[b_i, sen_i, :sen_l] = sequences[b_i][sen_i] + mask[b_i, sen_i, :sen_l] = 1 
+    return padded_seqs, mask  # , sen_lengths
+
+
+def find_max_triples(st_prob, ed_prob, top_n=5, prob_thd=None, tensor_type="torch"):
+    """ Find a list of (k1, k2) where k1 < k2 with the maximum values of st_prob[k1] * ed_prob[k2]
+    Args:
+        st_prob (torch.Tensor or np.ndarray): (N, L) batched start_idx probabilities
+        ed_prob (torch.Tensor or np.ndarray): (N, L) batched end_idx probabilities
+        top_n (int): return topN pairs with highest values
+        prob_thd (float or None): drop pairs whose confidence (st_prob * ed_prob) is below this value
+        tensor_type: str, np or torch
+    Returns:
+        batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...]
+    """
+    if tensor_type == "torch":
+        st_prob, ed_prob = st_prob.data.numpy(), ed_prob.data.numpy()
+    product = np.einsum("bm,bn->bmn", st_prob, ed_prob)
+    # (N, L, L) the lower part becomes zeros, start_idx < ed_idx
+    upper_product = np.triu(product, k=1)
+    return find_max_triples_from_upper_triangle_product(upper_product, top_n=top_n, prob_thd=prob_thd)
+
+
+def find_max_triples_from_upper_triangle_product(upper_product, top_n=5, prob_thd=None):
+    """ Find a list of (k1, k2) where k1 < k2 with the maximum values of p1[k1] * p2[k2]
+    Args:
+        upper_product (torch.Tensor or np.ndarray): (N, L, L), the lower part becomes zeros, end_idx > start_idx
+        top_n (int): return topN pairs with highest values
+        prob_thd (float or None): drop pairs whose confidence is below this value
+    Returns:
+        batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...]
+    """
+    batched_sorted_triple = []
+    for idx, e in enumerate(upper_product):
+        sorted_triple = top_n_array_2d(e, top_n=top_n)
+        if prob_thd is not None:
+            # keep only the rows whose confidence (3rd column) passes the threshold
+            sorted_triple = sorted_triple[sorted_triple[:, 2] >= prob_thd]
+        batched_sorted_triple.append(sorted_triple)
+    return batched_sorted_triple
+
+
+def top_n_array_2d(array_2d, top_n):
+    """ Get topN indices and values of a 2d array,
+    return a (top_n, 3) array of [row_idx, col_idx, value], ranked by value in descending order.
+    """
+    row_indices, column_indices = np.unravel_index(np.argsort(array_2d, axis=None), array_2d.shape)
+    row_indices = row_indices[::-1][:top_n]
+    column_indices = column_indices[::-1][:top_n]
+    sorted_values = array_2d[row_indices, column_indices]
+    return np.stack([row_indices, column_indices, sorted_values], axis=1)  # (N, 3)
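+
+
+if __name__ == "__main__":
+    # Minimal usage sketch for find_max_triples: the batch size and sequence length here are
+    # arbitrary, and this block only illustrates the expected output format (it is not called
+    # anywhere in the training or evaluation pipeline and is safe to remove).
+    example_st = torch.softmax(torch.randn(2, 6), dim=1)  # (N=2, L=6) start-index probabilities
+    example_ed = torch.softmax(torch.randn(2, 6), dim=1)  # (N=2, L=6) end-index probabilities
+    for triples in find_max_triples(example_st, example_ed, top_n=3):
+        # each row is [st_idx, ed_idx, st_prob * ed_prob] with st_idx < ed_idx,
+        # sorted by confidence in descending order; shape (top_n, 3)
+        print(triples)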