diff --git a/baselines/__init__.py b/baselines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/__pycache__/__init__.cpython-311.pyc b/baselines/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54c099d3ef7936fc280d7487c04591f749de52e1 Binary files /dev/null and b/baselines/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/README.md b/baselines/clip_alignment_with_language/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8629394429e1dee1b3dafe346a49569885f851c8 --- /dev/null +++ b/baselines/clip_alignment_with_language/README.md @@ -0,0 +1,25 @@ +# Clip Alignment With Language +This folder contains the CAL model described in the paper +``` +@article{Escorcia2019TemporalLO, + title={Temporal Localization of Moments in Video Collections with Natural Language}, + author={Victor Escorcia and Mattia Soldan and Josef Sivic and Bernard Ghanem and Bryan Russell}, + journal={ArXiv}, + year={2019}, + volume={abs/1907.12763} +} +``` + +It also resembles the MCN model in +``` +@article{Hendricks2017LocalizingMI, + title={Localizing Moments in Video with Natural Language}, + author={Lisa Anne Hendricks and Oliver Wang and Eli Shechtman and Josef Sivic and Trevor Darrell and Bryan C. Russell}, + journal={2017 IEEE International Conference on Computer Vision (ICCV)}, + year={2017}, + pages={5804-5813} +} +``` + +Disclaimer: This code is implemented by [Jie Lei](http://www.cs.unc.edu/~jielei/) for the TVR dataset, +it does not guarantee the reproducibility of the original authors' results. \ No newline at end of file diff --git a/baselines/clip_alignment_with_language/__init__.py b/baselines/clip_alignment_with_language/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d2ae334fa75b3ba2568a715e4fb67ebf04c38d1 Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eab312801d33f658c8d6eb196feaba542e3eb4ba Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d01ecf759821ef426656300ffcbf3ab974383a75 Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57ced9a9be2217629a93e3a0d92727b38c11953a Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc differ diff --git 
a/baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a554bc2a28878ad9c0450a798831f8d1d6abdae Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/config.py b/baselines/clip_alignment_with_language/config.py new file mode 100644 index 0000000000000000000000000000000000000000..db3ba382f839f0bd80d7becb15c753a37428bbb3 --- /dev/null +++ b/baselines/clip_alignment_with_language/config.py @@ -0,0 +1,207 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile +from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default="res", help="id of the current run") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. 
" + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=0.05, help="learning rate") + self.parser.add_argument("--wd", type=float, default=0, help="weight decay") + self.parser.add_argument("--momentum", type=float, default=0.95, help="momentum for SGD") + self.parser.add_argument("--n_epoch", type=int, default=108, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=108, help="number of epochs to early stop") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=1000, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_proposal_bsz", type=int, default=200, + help="mini-batch size at inference, for proposals") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") + self.parser.add_argument("--inter_loss_weight", type=float, default=0.4, help="margin for ranking loss") + self.parser.add_argument("--loss_type", type=str, default="hinge", choices=["hinge", "lse"], + help="att loss type, can be hinge loss or its smooth approximation LogSumExp") + + # Model and Data config + self.parser.add_argument("--max_sub_l", type=int, default=50, + help="max length of all sub sentence 97.71 under 50 for 3 sentences") + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--pos_iou_thd", type=float, default=0.7, help="moments with IoU >= as positive") + self.parser.add_argument("--neg_iou_thd", type=float, default=0.35, help="moments with IoU < as negative") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--eval_path", type=str, default=None, + help="Evaluating during training, for Dev set. If None, will only do training, " + "anet_cap and charades_sta has no dev set, so None") + self.parser.add_argument("--external_train_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide " + "inter-nvideo negative sampling. ") + self.parser.add_argument("--init_ckpt_path", type=str, default=None, + help="init model parameters from checkpoint. Use absolute path") + self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide evaluation. 
") + self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features") + self.parser.add_argument("--word2idx_path", type=str, + help="a dict, {word: word_idx, ...}, " + "special tokens are {: 0, : 1, : 2}") + self.parser.add_argument("--vocab_size", type=int, default=-1, + help="Set automatically to len(word2idx)") + self.parser.add_argument("--glove_path", type=str, + help="path to file containing the GloVe embeddings for words in word2idx") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--desc_feat_size", type=int, default=768) + self.parser.add_argument("--ctx_mode", type=str, + choices=["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. a combination of [video, sub, tef]") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it when using i3d_resnet concat feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + self.parser.add_argument("--clip_length", type=float, default=None, + help="each video will be uniformly segmented into small clips, " + "will automatically loaded from ProposalConfigs if None") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + + self.parser.add_argument("--model_type", default="cal", choices=["cal", "mcn"]) + self.parser.add_argument("--embedding_size", type=int, default=768) + self.parser.add_argument("--lstm_hidden_size", type=int, default=256) + self.parser.add_argument("--visual_hidden_size", type=int, default=256) + self.parser.add_argument("--output_size", type=int, default=256) + + # post processing + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap,") + self.parser.add_argument("--max_after_nms", type=int, default=100, help="Stores at max_after_nms for eval") + self.parser.add_argument("--max_before_nms", type=int, default=300, help="Max before nms") + self.parser.add_argument("--use_intermediate", action="store_true", + help="Whether to use/save intermediate results to results directory." 
+ "Might want use this if we are going to ") + + def save_args(self, opt): + args = vars(opt) + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", "eval_split_name", "eval_path", + "use_intermediate", "external_inference_vr_res_path"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + if opt.clip_length is None: + opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.model_type, opt.ctx_mode, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"]) + + self.save_args(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.pin_memory = not opt.no_pin_memory + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(vars(opt).items())})) + self.opt = opt + return opt + + +class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", choices=["VCMR", "SVMR", "VR"], default="SVMR", + help="Which tasks to run." 
+ "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval.") diff --git a/baselines/clip_alignment_with_language/inference.py b/baselines/clip_alignment_with_language/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..b11d669682346aef7b8eb09788348a948847e14b --- /dev/null +++ b/baselines/clip_alignment_with_language/inference.py @@ -0,0 +1,672 @@ +import os +import time +import math +import pprint +import numpy as np +from tqdm import tqdm, trange +from collections import defaultdict, OrderedDict + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.clip_alignment_with_language.config import TestOptions +from baselines.clip_alignment_with_language.model import CALWithSub +from baselines.clip_alignment_with_language.proposal_retrieval_dataset import \ + proposal_retrieval_collate, ProposalRetrievalEvalDataset, prepare_batch_inputs +from utils.basic_utils import save_jsonl, save_json, load_json +from utils.temporal_nms import temporal_non_maximum_suppression +from utils.tensor_utils import pad_sequences_1d +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list): + """ + Args: + proposals_embedding_list: list(torch.Tensor), bsz * (N_prop, N_clips, D_o) + proposals_mask_list: list(torch.Tensor), bsz * (N_prop, N_clips) + """ + if len(proposals_embedding_list) == 1: + return proposals_embedding_list[0], proposals_mask_list[0] + else: # > 1 + max_n_clips = max([e.shape[1] for e in proposals_embedding_list]) + n_proposals = sum([len(e) for e in proposals_embedding_list]) + d = proposals_embedding_list[0].shape[2] + proposals_embedding = proposals_embedding_list[0].new_zeros((n_proposals, max_n_clips, d)) + proposals_mask = proposals_mask_list[0].new_zeros((n_proposals, max_n_clips)) + mask_lengths = [0, ] + [len(m) for m in proposals_mask_list] + mask_cumsum_lengths = np.cumsum(mask_lengths) + for idx, (e, m) in enumerate(zip(proposals_embedding_list, proposals_mask_list)): + proposals_embedding[mask_cumsum_lengths[idx]:mask_cumsum_lengths[idx + 1], :e.shape[1]] = e + proposals_mask[mask_cumsum_lengths[idx]:mask_cumsum_lengths[idx + 1], :m.shape[1]] = m + return proposals_embedding, proposals_mask + + +def compute_query_embeddings(model, eval_dataset, opt, load_gt_vid_name): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 100 (hsz) * 4 / (1024**2) = 7.63 MB + """ + model.eval() + eval_dataset.set_data_mode("query") + eval_dataset.load_gt_vid_name_for_query(load_gt_vid_name) + query_eval_loader = DataLoader(eval_dataset, + collate_fn=proposal_retrieval_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + global_meta_list = [] # list(dicts) + # n_query = min(len(eval_dataset), opt.eval_query_bsz) if opt.debug else len(eval_dataset) + n_query = len(eval_dataset) + global_query_embedding = torch.empty((n_query, + model.config.output_size), + dtype=torch.float32, device=opt.device) # (N_q, D_o) + for idx, batch in tqdm(enumerate(query_eval_loader), + desc="Computing q embedding", + total=len(query_eval_loader)): + global_meta_list.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + global_query_embedding[idx * opt.eval_query_bsz: (idx + 1) * opt.eval_query_bsz] = \ + model.query_encoder(**model_inputs) + + if opt.debug: + break + return global_meta_list, global_query_embedding + + +def compute_proposal_embeddings(model, eval_dataset, opt): + """Use val set to do evaluation, remember to run with torch.no_grad(). + estimated 1000 (videos) * 300 (proposals) * 20 (clips) * 100 (hsz) * 4 / (1024 ** 3) = 2.24 GB + """ + model.eval() + eval_dataset.set_data_mode("context") + global_meta_list = [] # list(dicts) + global_proposal_video_embedding_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips, D_o] + global_proposal_sub_embedding_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips, D_o] + global_proposal_video_mask_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips] + global_proposal_sub_mask_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips] + for idx, single_video_info in tqdm(enumerate(eval_dataset), + desc="Computing prop embedding for videos", + total=len(eval_dataset)): + global_meta_list.append(single_video_info["meta"]) + if model.use_video or model.tef_only: + proposals_features_list = single_video_info["model_inputs"]["video_moment_features_list"] + proposals_mask_list = single_video_info["model_inputs"]["video_moment_mask_list"] + proposals_mask_list = [e.to(opt.device, non_blocking=opt.pin_memory) for e in proposals_mask_list] + proposals_embedding_list = [] # (N_prop, D_o) + for feat in proposals_features_list: + proposals_embedding_list.append( + model.moment_encoder(feat.to(opt.device, non_blocking=opt.pin_memory), module_name="video")) + p, m = combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list) + global_proposal_video_embedding_list.append(p) + global_proposal_video_mask_list.append(m) + else: + global_proposal_video_embedding_list.append(None) + + if model.use_sub: + proposals_features_list = single_video_info["model_inputs"]["sub_moment_features_list"] + proposals_mask_list = single_video_info["model_inputs"]["sub_moment_mask_list"] + proposals_mask_list = [e.to(opt.device, non_blocking=opt.pin_memory) for e in proposals_mask_list] + proposals_embedding_list = [] # (N_prop, D_o) + for feat in proposals_features_list: + proposals_embedding_list.append( + model.moment_encoder(feat.to(opt.device, non_blocking=opt.pin_memory), module_name="sub")) + p, m = combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list) + global_proposal_sub_embedding_list.append(p) + global_proposal_sub_mask_list.append(m) + else: + 
global_proposal_sub_embedding_list.append(None) + + if opt.debug and idx == 100: + break + global_proposal_mask_list = global_proposal_sub_mask_list if model.use_sub else global_proposal_video_mask_list + return global_meta_list, global_proposal_video_embedding_list, \ + global_proposal_sub_embedding_list, global_proposal_mask_list + + +def compute_query_proposal_distance(model, eval_dataset, opt, tasks=("SVMR",)): + """compute and save query and video proposal embeddings, + tasks: SVMR (single video moment retrieval), VCMR (video corpus moment retrieval) + """ + is_svmr = "SVMR" in tasks + is_vcmr = "VCMR" in tasks + query_meta_list, query_embed = compute_query_embeddings(model, eval_dataset, opt, + load_gt_vid_name=is_svmr) + video_meta_list, video_prop_embed_list, sub_prop_embed_list, prop_mask_list = \ + compute_proposal_embeddings(model, eval_dataset, opt) + + eval_res = dict( + query_meta=query_meta_list, # N_q * dict() + video_meta=video_meta_list, # N_videos * dict() + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + query_prop_dist_vcmr=[], # N_videos * (N_q, N_prop), note N_prop is changing for each video. + query_prop_dist_svmr=[], # N_q * (N_prop, ), each query has a GT video, no need to calc. for all. + ) + if is_vcmr: + for v_prop_embed, s_prop_embed, prop_mask in tqdm( + zip(video_prop_embed_list, sub_prop_embed_list, prop_mask_list), + desc="Computing VCMR q to prop dist for videos", + total=len(video_prop_embed_list)): + query_prop_dist = model.compute_cdist_inference( + query_embed, v_prop_embed, s_prop_embed, prop_mask) # (N_q, N_prop) + eval_res["query_prop_dist_vcmr"].append(query_prop_dist.cpu()) + if opt.debug: + break + + if is_svmr: + if opt.debug: + debug_query_meta = [] + # this is different from video2idx + svmr_video2meta_idx = {e["vid_name"]: idx for idx, e in enumerate(video_meta_list)} + # logger.info("svmr_video2idx {}".format(list(svmr_video2idx.keys())[:3])) + for single_q_embed, single_q_meta in tqdm(zip(query_embed, query_meta_list), + desc="Computing SVMR q to prop dist for videos", + total=len(query_embed)): + # logger.info("single_q_meta[vid_name] {}".format(single_q_meta["vid_name"])) + if opt.debug: + if single_q_meta["vid_name"] not in svmr_video2meta_idx: + continue + debug_query_meta.append(single_q_meta) + q_gt_vid_meta_idx = svmr_video2meta_idx[single_q_meta["vid_name"]] + v_prop_embed = video_prop_embed_list[q_gt_vid_meta_idx] # [N_prop, N_clips, D_o] + s_prop_embed = sub_prop_embed_list[q_gt_vid_meta_idx] # [N_prop, N_clips, D_o] + prop_mask = prop_mask_list[q_gt_vid_meta_idx] # [N_prop, N_clips] + query_prop_dist = model.compute_cdist_inference( + single_q_embed.unsqueeze(0), v_prop_embed, s_prop_embed, prop_mask) # (1, N_prop) + eval_res["query_prop_dist_svmr"].append(query_prop_dist.squeeze(0).cpu().numpy()) + if opt.debug: + eval_res["query_meta"] = debug_query_meta + return eval_res + + +def filter_vcmr_by_nms(all_video_predictions, nms_threshold=0.6, + max_before_nms=1000, max_after_nms=100, score_col_idx=3): + """ Apply non-maximum suppression for all the predictions for each video. + 1) group predictions by video index + 2) apply nms individually for each video index group + 3) combine and sort the predictions + Args: + all_video_predictions: list(sublist), + Each sublist is [video_idx (int), st (float), ed(float), score (float)] + Note the scores are negative distances. 
+ nms_threshold: float + max_before_nms: int + max_after_nms: int + score_col_idx: int + Returns: + + """ + predictions_neg_by_video_group = defaultdict(list) + for pred in all_video_predictions[:max_before_nms]: + predictions_neg_by_video_group[pred[0]].append(pred[1:]) # [st (float), ed(float), score (float)] + + predictions_by_video_group_neg_after_nms = dict() + for video_idx, grouped_preds in predictions_neg_by_video_group.items(): + predictions_by_video_group_neg_after_nms[video_idx] = \ + temporal_non_maximum_suppression(grouped_preds, nms_threshold=nms_threshold) + + predictions_after_nms = [] + for video_idx, grouped_preds in predictions_by_video_group_neg_after_nms.items(): + for pred in grouped_preds: + pred = [video_idx] + pred # [video_idx (int), st (float), ed(float), score (float)] + predictions_after_nms.append(pred) + + # ranking happens across videos + predictions_after_nms = sorted(predictions_after_nms, + key=lambda x: x[score_col_idx], + reverse=True)[:max_after_nms] # descending order + return predictions_after_nms + + +def post_processing_vcmr_nms(vcmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100): + """ + vcmr_res: list(dict), each dict is{ + "desc": str, + "desc_id": int, + "predictions": list(sublist) # each sublist is + [video_idx (int), st (float), ed(float), score (float)], video_idx could be different + } + """ + processed_vcmr_res = [] + for e in vcmr_res: + e["predictions"] = filter_vcmr_by_nms(e["predictions"], + nms_threshold=nms_thd, + max_before_nms=max_before_nms, + max_after_nms=max_after_nms) + processed_vcmr_res.append(e) + return processed_vcmr_res + + +def post_processing_svmr_nms(svmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100): + """ + svmr_res: list(dict), each dict is + {"desc": str, + "desc_id": int, + "predictions": list(sublist) # each sublist is + [video_idx (int), st (float), ed(float), score (float)], video_idx is the same. + } + """ + processed_svmr_res = [] + for e in svmr_res: + # the predictions are sorted inside the nms func. + _predictions = [d[1:] for d in e["predictions"][:max_before_nms]] + _predictions = temporal_non_maximum_suppression( + _predictions, nms_threshold=nms_thd)[:max_after_nms] + _video_id = e["predictions"][0][0] # video_id is the same for all predictions + e["predictions"] = [[_video_id, ] + d for d in _predictions] + processed_svmr_res.append(e) + return processed_svmr_res + + +def generate_vcmr_predictions_from_res_with_external(eval_res, max_prop_per_query=300, query_bsz_in_sort=1000): + """ This function is for Video Corpus Moment Retrieval (VCMR). + Generate prediction file which could be evaluated using standalone_eval.eval. + Args: + eval_res: dict( + query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str} + video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray} + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + video_bsz_in_sort=[], # N_videos * (N_q, N_prop) + ) + max_prop_per_query: int or None. If None, generate ranking for all possible moments, else generate top {}. + query_bsz_in_sort: int, only sort a subset of queries at a time, it will be too large to sort all queries. + return: + list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)), + each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance. 
+ """ + # video2idx + video2idx = eval_res["video2idx"] + video_meta = eval_res["video_meta"] + query_meta = eval_res["query_meta"] + video_idx2meta_idx = {video2idx[m["vid_name"]]: i for i, m in enumerate(video_meta)} + external_query2video = eval_res["external_query2video"] if "external_query2video" in eval_res else None + # 「query idx: [video meta idx]」 + external_query2video_meta_idx = {k: [video_idx2meta_idx[e] for e in v] for k, v in external_query2video.items()} + + external_ordered_video_meta_indices = torch.LongTensor( + [external_query2video_meta_idx[e["desc_id"]] for e in query_meta]) # (Nq, 5) + top_n_retrieved = external_ordered_video_meta_indices.shape[1] + + # (N_videos, N_prop, N_q), (N_videos, N_prop) + padded_dist, padded_mask = pad_sequences_1d([e.transpose(0, 1) for e in eval_res["query_prop_dist_vcmr"]], + dtype=eval_res["query_prop_dist_vcmr"][0].dtype, + device=eval_res["query_prop_dist_vcmr"][0].device) + # putting 'NaN' into the invalid bits, torch.sort considers 'NaN' as larger than any number!!! + padded_dist += (padded_mask.unsqueeze(2) == 0).float() * 1e10 + n_videos, n_prop, n_q = padded_dist.shape + padded_dist = padded_dist.permute(2, 0, 1) # (N_q, N_videos, N_prop) + + # get only top retrieved, N_videos now decreased to top_n_retrieved + row_indices = torch.arange(n_q, device=padded_dist.device) + padded_dist = torch.stack([ + padded_dist[row_indices, external_ordered_video_meta_indices[:, col_idx]] + for col_idx in range(top_n_retrieved)], dim=1) # (N_q, 5, N_prop) + n_videos = top_n_retrieved + + padded_dist = padded_dist.view(n_q, -1).contiguous() # (N_q, N_video*N_prop) + print("n_videos, n_prop, n_q {}".format((n_videos, n_prop, n_q))) + print("padded_dist, {}".format(padded_dist.shape)) + + sorted_distances, sorted_indices = torch.topk(padded_dist.to(torch.device("cuda:0"), non_blocking=True), + k=min(max_prop_per_query, n_videos * n_prop), + dim=1, largest=False, sorted=True) # (N_q, max_prop_per_query) * 2 + print("orted_distances {}, sorted_indices {}".format(sorted_distances.shape, sorted_indices.shape)) + sorted_distances = - sorted_distances.cpu().numpy() + + # (N_q, max_prop_per_query) * 2, prop_indices: inside video indices. 
+ video_meta_indices_retrieved = torch.floor(sorted_indices.float() / n_prop).long().cpu().numpy() + # map back to original video idx (not video meta idx, but real video idx) + video_indices = np.array([[external_query2video[query_meta[i]["desc_id"]][j] for j in r] + for i, r in enumerate(video_meta_indices_retrieved)]) # (N_q, max_prop_per_query) + prop_indices = torch.remainder(sorted_indices, n_prop).cpu().numpy() # (N_q, max_prop_per_query) + print("video_indices {}, prop_indices {}".format(video_indices.shape, prop_indices.shape)) + + vr_res = [] + for i in trange(n_q, desc="[VR] Loop over queries to generate predictions"): + row = video_indices[i] + score_row = - sorted_distances[i] + cur_vr_redictions = [] + for j, video_idx in enumerate(row): + cur_vr_redictions.append([int(video_idx), 0, 0, float(score_row[j])]) + cur_query_pred = dict( + desc_id=query_meta[i]["desc_id"], + desc=query_meta[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + + vcmr_res = [] + logger.debug("sorted_indices {}".format(sorted_indices.shape)) + logger.debug("sorted_distances {}".format(sorted_distances.shape)) + out_bounds_cnt = 0 + for idx, (v_row_indices, p_row_indices) in tqdm(enumerate(zip(video_indices, prop_indices)), + desc="[VCMR] Loop over queries to generate predictions", + total=n_q): # query + sorted_distances_row = - sorted_distances[idx] # converted to negative distance + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [] + for col_idx, (v_col_idx, p_col_idx) in enumerate(zip(v_row_indices, p_row_indices)): + cur_proposals = eval_res["video_meta"][video_idx2meta_idx[v_col_idx]]["proposals"] + cur_pred = [] + cur_pred += [int(v_col_idx), ] + # what is wrong with the indexing below??? (out of bounds), but results seems fine??? + # Not a bug. Since there might be less than max_before_nms proposals from the top retrieved videos + if p_col_idx >= len(cur_proposals): + out_bounds_cnt += 1 + p_col_idx = len(cur_proposals)-1 + cur_pred += cur_proposals[p_col_idx].tolist() + cur_pred += [float(sorted_distances_row[col_idx])] + cur_ranked_predictions.append(cur_pred) + cur_query_pred = dict( + desc_id=eval_res["query_meta"][idx]["desc_id"], + desc=eval_res["query_meta"][idx]["desc"], + predictions=cur_ranked_predictions + ) + vcmr_res.append(cur_query_pred) + logger.info("[DEBUG] out_bounds_cnt {}".format(out_bounds_cnt)) + return vcmr_res, vr_res + + +def generate_vcmr_predictions_from_res(eval_res, max_prop_per_query=300, query_bsz_in_sort=1000): + """ This function is for Video Corpus Moment Retrieval (VCMR). + Generate prediction file which could be evaluated using standalone_eval.eval. + Args: + eval_res: dict( + query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str} + video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray} + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + video_bsz_in_sort=[], # N_videos * (N_q, N_prop) + ) + max_prop_per_query: int or None. If None, generate ranking for all possible moments, else generate top {}. + query_bsz_in_sort: int, only sort a subset of queries at a time, it will be too large to sort all queries. + return: + list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)), + each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance. 
+ """ + # video2idx + video2idx = eval_res["video2idx"] + + # (N_videos, N_prop, N_q), (N_videos, N_prop) + padded_dist, padded_mask = pad_sequences_1d([e.transpose(0, 1) for e in eval_res["query_prop_dist_vcmr"]], + dtype=eval_res["query_prop_dist_vcmr"][0].dtype, + device=eval_res["query_prop_dist_vcmr"][0].device) + # putting 'NaN' into the invalid bits, torch.sort considers 'NaN' as larger than any number!!! + padded_dist += (padded_mask.unsqueeze(2) == 0).float() * 1e10 + n_videos, n_prop, n_q = padded_dist.shape + print("n_videos, n_prop, n_q {}".format((n_videos, n_prop, n_q))) + padded_dist = padded_dist.view(n_videos * n_prop, n_q).transpose(0, 1).contiguous() # (N_q, N_video*N_prop) + print("padded_dist, {}".format(padded_dist.shape)) + + sorted_distances, sorted_indices = torch.topk(padded_dist.to(torch.device("cuda:0"), non_blocking=True), + k=min(max_prop_per_query, n_videos * n_prop), + dim=1, largest=False, sorted=True) # (N_q, max_prop_per_query) * 2 + sorted_distances = - sorted_distances.cpu().numpy() + + # (N_q, max_prop_per_query) * 2, prop_indices: inside video indices. + video_meta_indices = torch.floor(sorted_indices.float() / n_prop).long().cpu().numpy() + prop_indices = torch.remainder(sorted_indices, n_prop).cpu().numpy() + + vr_res = [] + query_meta = eval_res["query_meta"] + for i in trange(n_q, desc="[VR] Loop over queries to generate predictions"): + row = video_meta_indices[i] + score_row = - sorted_distances[i] + cur_vr_redictions = [] + for j, meta_idx in enumerate(row): + video_idx = video2idx[eval_res["video_meta"][meta_idx]["vid_name"]] + cur_vr_redictions.append([video_idx, 0, 0, float(score_row[j])]) + cur_query_pred = dict( + desc_id=query_meta[i]["desc_id"], + desc=query_meta[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + + vcmr_res = [] + logger.debug("sorted_indices {}".format(sorted_indices.shape)) + logger.debug("sorted_distances {}".format(sorted_distances.shape)) + for idx, (vm_row_indices, p_row_indices) in tqdm(enumerate(zip(video_meta_indices, prop_indices)), + desc="[VCMR] Loop over queries to generate predictions", + total=n_q): # query + sorted_distances_row = - sorted_distances[idx] # converted to negative distance + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [] + for col_idx, (v_col_idx, p_col_idx) in enumerate(zip(vm_row_indices, p_row_indices)): + cur_pred = [] + cur_pred += [video2idx[eval_res["video_meta"][v_col_idx]["vid_name"]], ] + cur_pred += eval_res["video_meta"][v_col_idx]["proposals"][p_col_idx].tolist() + cur_pred += [float(sorted_distances_row[col_idx])] + cur_ranked_predictions.append(cur_pred) + cur_query_pred = dict( + desc_id=eval_res["query_meta"][idx]["desc_id"], + desc=eval_res["query_meta"][idx]["desc"], + predictions=cur_ranked_predictions + ) + vcmr_res.append(cur_query_pred) + return vcmr_res, vr_res + + +def generate_svmr_predictions_from_res(eval_res, max_prop_per_query=None): + """ This function is for Video Corpus Moment Retrieval (VCMR). + Generate prediction file which could be evaluated using standalone_eval.eval. 
+ Args: + eval_res: dict( + query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str} + video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray} + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + query_prop_dist_svmr=[], # N_q * (N_prop, ) + ) + max_prop_per_query: not used + return: + list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)), + each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance. + """ + video2idx = eval_res["video2idx"] + + svmr_res = [] + svmr_video2meta_idx = {e["vid_name"]: idx for idx, e in enumerate(eval_res["video_meta"])} + for idx, (q_p_dist, q_m) in tqdm(enumerate(zip(eval_res["query_prop_dist_svmr"], eval_res["query_meta"])), + desc="Loop over queries to generate predictions", + total=len(eval_res["query_prop_dist_svmr"])): # query + sorted_indices = np.argsort(q_p_dist) # (N_prop, ) # ascending order, distance + if max_prop_per_query is not None: + sorted_indices = sorted_indices[:max_prop_per_query] + v_eval_idx = video2idx[q_m["vid_name"]] + v_meta_idx = svmr_video2meta_idx[q_m["vid_name"]] + proposals = eval_res["video_meta"][v_meta_idx]["proposals"] # (N_p, 2) + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [ + [v_eval_idx, ] + proposals[sort_idx].tolist() + [- round(float(q_p_dist[sort_idx]), 4), ] + for sort_idx in sorted_indices] + cur_query_pred = dict( + desc_id=q_m["desc_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + + +def get_submission_top_n(submission, top_n=100): + def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + top_n_submission = dict(video2idx=submission["video2idx"], ) + for k in submission: + if k != "video2idx": + top_n_submission[k] = get_prediction_top_n(submission[k], top_n) + return top_n_submission + + +def load_external_vr_res(external_vr_res_path, top_n_vr_videos=5): + """return a mapping from desc_id to top retrieved video id""" + external_vr_res = load_json(external_vr_res_path) + external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"] + query2video = {e["desc_id"]: [sub_e[0] for sub_e in e["predictions"]] for e in external_vr_res} + return query2video + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_before_nms=1000, max_after_nms=100): + model.eval() + logger.info("Computing scores") + logger.info("Start timing") + # times = [] # do not use + # for _ in range(3): + # st_time = time.time() + if opt.use_intermediate: + intermediate_cache_path = os.path.join(opt.results_dir, "{}_eval_res.pt".format(opt.eval_split_name)) + if not os.path.exists(intermediate_cache_path): + logger.info("Saving intermediate results {}.".format(intermediate_cache_path)) + eval_res = compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks) + torch.save(eval_res, intermediate_cache_path) + else: + logger.info("Loading intermediate results {}.".format(intermediate_cache_path)) + eval_res = torch.load(intermediate_cache_path) + else: + logger.info("Running without saving intermediate results, you might want to turn on --use_intermediate.") + eval_res 
= compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks) + # del model # We dont need model anymore + + # eval_res = compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks) + + logger.info("Generating predictions from scores") + eval_submission_raw = dict(video2idx=eval_res["video2idx"]) + if "SVMR" in tasks: + eval_submission_raw["SVMR"] = generate_svmr_predictions_from_res( + eval_res, max_prop_per_query=max_before_nms) + # vcmr_loading_time = 0 + if "VCMR" in tasks: + if opt.external_inference_vr_res_path is not None: + logger.info("Using external VR results from {}".format(opt.external_inference_vr_res_path)) + # vcmr_loading_time = time.time() + eval_res["external_query2video"] = load_external_vr_res( + opt.external_inference_vr_res_path, top_n_vr_videos=5) + # vcmr_loading_time = time.time() - vcmr_loading_time + vcmr_res, vr_res = generate_vcmr_predictions_from_res_with_external( + eval_res, max_prop_per_query=max_before_nms) + else: + vcmr_res, vr_res = generate_vcmr_predictions_from_res( + eval_res, max_prop_per_query=max_before_nms) + eval_submission_raw["VCMR"] = vcmr_res + eval_submission_raw["VR"] = vr_res + # times += [time.time() - st_time - vcmr_loading_time] + # times = torch.FloatTensor(times) + IOU_THDS = (0.5, 0.7) + + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=max_after_nms) + if max_after_nms < 1000: + save_json(eval_submission, submission_path) + else: + torch.save(eval_submission, submission_path.replace(".json", ".pt")) + + metrics = eval_retrieval(eval_submission, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug, + use_desc_type=opt.dset_name == "tvr") + # metrics["time_avg"] = float(times.mean()) + # metrics["time_std"] = float(times.std()) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + if opt.nms_thd != -1: + logger.info("Performing nms with nms_thd {}".format(opt.nms_thd)) + eval_submission_after_nms = dict(video2idx=eval_submission_raw["video2idx"]) + for k, nms_func in POST_PROCESSING_MMS_FUNC.items(): + if k in eval_submission_raw: + eval_submission_after_nms[k] = nms_func(eval_submission_raw[k], + nms_thd=opt.nms_thd, + max_before_nms=max_before_nms, + max_after_nms=max_after_nms) + + logger.info("Saving/Evaluating nms results") + submission_nms_path = submission_path.replace(".json", "_nms_thd_{}.json".format(opt.nms_thd)) + save_json(eval_submission_after_nms, submission_nms_path) + metrics_nms = eval_retrieval(eval_submission_after_nms, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_nms_path = submission_nms_path.replace(".json", "_metrics.json") + save_json(metrics_nms, save_metrics_nms_path, save_pretty=True, sort_keys=False) + latest_file_paths += [submission_nms_path, save_metrics_nms_path] + else: + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = CALWithSub(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + 
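+    # Move the model to the configured device; wrap it in DataParallel when more than one GPU id is given.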
+ if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = ProposalRetrievalEvalDataset( + dset_name=opt.dset_name, + model_type=opt.model_type, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + eval_proposal_bsz=opt.eval_proposal_bsz, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + model = setup_model(opt) + save_submission_filename = \ + "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, tasks=opt.tasks, + max_before_nms=opt.max_before_nms, max_after_nms=opt.max_after_nms) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/clip_alignment_with_language/local_utils/__init__.py b/baselines/clip_alignment_with_language/local_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc b/baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dca00e79ec8256ef8f67c2db3d6d805276c2503b Binary files /dev/null and b/baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc b/baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff81922c6ee768f2dab46dfcd67c9edc0b1b356b Binary files /dev/null and b/baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc b/baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3963685d54d62927885a4c1427d81368674005b Binary files /dev/null and b/baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py b/baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py new 
file mode 100644 index 0000000000000000000000000000000000000000..d4f4f748c41ebf3a76aaf302b6f94caf3e2098bf --- /dev/null +++ b/baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py @@ -0,0 +1,117 @@ +""" +Compute oracle upper bound for a given proposal method, which acts like +a reversed recall, where we recall the GT timestamp pairs in the set of +generated proposals. +""" +import pprint +import numpy as np +from tqdm import tqdm +from collections import Counter +from utils.basic_utils import load_jsonl, save_json +from standalone_eval.eval import compute_temporal_iou_batch +from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface, ProposalConfigs + + +def get_didemo_agreed_ts(times_list): + """ + input example: [[1, 1], [1, 1], [1, 1], [0, 0]], + return: [1, 1]""" + times_str_list = [tuple(e) for e in times_list] + times_str_list_counter = Counter(times_str_list) + most_frequent_times = times_str_list_counter.most_common(1)[0][0] + return most_frequent_times + + +def get_proposals_for_single_desc_video_pair(single_data, proposal_fn, dset_name): + proposal_info = dict( + vid_name=single_data["vid_name"], + desc_id=single_data["desc_id"], + gt_ts=single_data["ts"] if dset_name != "didemo" else get_didemo_agreed_ts(single_data["ts"]), + proposals=proposal_fn(video_id="", metadata={"duration": single_data["duration"]}), + ) + proposal_info["proposal_ious"] = compute_temporal_iou_batch( + proposal_info["proposals"], proposal_info["gt_ts"]) + return proposal_info + + +def get_proposals_for_videos(datalist, dset_name): + """datalist list(dict): each dict is + {"desc_id": str/int, "duration": float, "ts": [st (float), ed (float)], ...} + Note for Didemo dataset, "ts" entry is a list of [st (float), ed (float)] from different annotators, + here we use the most frequent ts, we break ties by randomly sample one + """ + proposal_interface = get_proposal_interface(dset_name) + video_proposals_list = [] + for e in tqdm(datalist, desc="Computing video proposals"): + video_proposals_list.append( + get_proposals_for_single_desc_video_pair(e, proposal_interface, dset_name)) + return video_proposals_list + + +def is_recalled_single_moment(proposal_ious, iou_thds=(0.5, 0.7)): + """ + Args: + proposal_ious: np.ndarray, shape (N_proposal, ) + iou_thds: set, temporal IoU thresholds + + Returns: + list(bool), len == len(iou_thds), indicates whether recall under a iou_thd is found. 
+ """ + recalled = [False, ] * len(iou_thds) + for idx, iou_thd in enumerate(iou_thds): + recalled[idx] = np.sum(proposal_ious >= iou_thd) >= 1 # at least one + return recalled + + +def compute_proposal_recall_upper_bound(video_proposals_list, iou_thds=(0.5, 0.7)): + """video_proposals_list from get_proposals_for_videos()""" + iou_corrects = np.empty((len(video_proposals_list), 2), dtype=np.float32) + for idx, d in tqdm(enumerate(video_proposals_list), + desc="Computing recall for videos", + total=len(video_proposals_list)): + iou_corrects[idx] = is_recalled_single_moment(d["proposal_ious"], + iou_thds=iou_thds) + recall_by_iou = {iou_thd: float(np.mean(iou_corrects[:, idx])) + for idx, iou_thd in enumerate(iou_thds)} + return recall_by_iou + + +def main_compute_upper_bound(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-dset_name", type=str, choices=["tvr"]) + parser.add_argument("-eval_file_path", type=str, help="path to the file containing data to be evaluated") + parser.add_argument("-save_path", type=str, help="path to save the results") + parser.add_argument("-verbose", action="store_true") + args = parser.parse_args() + + eval_datalist = load_jsonl(args.eval_file_path) + video_proposals_list = get_proposals_for_videos(eval_datalist, args.dset_name) + recall_metrics = compute_proposal_recall_upper_bound(video_proposals_list, iou_thds=(0.5, 0.7)) + + video_proposals_list_by_video = {} + for p in video_proposals_list: + if p["vid_name"] in video_proposals_list_by_video: + continue + else: + video_proposals_list_by_video[p["vid_name"]] = p + video_proposals_list_by_video = list(video_proposals_list_by_video.values()) + total_n_clips_in_proposals = \ + np.sum([np.sum(e["proposals"][:, 1] - e["proposals"][:, 0]) for e in video_proposals_list_by_video]) + + results = dict( + avg_num_proposals=float(np.mean([len(e["proposals"]) for e in video_proposals_list_by_video])), + total_num_proposals=int(np.sum([len(e["proposals"]) for e in video_proposals_list_by_video])), + recall_metrics=recall_metrics, + dset_name=args.dset_name, + filename=args.eval_file_path, + proposal_config=ProposalConfigs[args.dset_name] + ) + results["avg_clip_per_proposal"] = total_n_clips_in_proposals / results["total_num_proposals"] + save_json(results, args.save_path, save_pretty=True) + if args.verbose: + pprint.pprint(results) + + +if __name__ == '__main__': + main_compute_upper_bound() diff --git a/baselines/clip_alignment_with_language/local_utils/proposal.py b/baselines/clip_alignment_with_language/local_utils/proposal.py new file mode 100644 index 0000000000000000000000000000000000000000..d81d32074f3e2f337b6ac52bb99ceb7ae869d09a --- /dev/null +++ b/baselines/clip_alignment_with_language/local_utils/proposal.py @@ -0,0 +1,181 @@ +# MIT License +# +# Copyright (c) 2018 Victor Escorcia Castillo +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== +""" +Group multiple methods to generate salient temporal windows in a video""" +import itertools +import numpy as np + +PROPOSAL_SCHEMES = ['DidemoICCV17SS', 'SlidingWindowMSRSS'] + + +class TemporalProposalsBase: + """Base class (signature) to generate temporal candidates in a video""" + def __call__(self, video_id, metadata=None, feature_collection=None): + raise NotImplementedError('Implement with the signature above') + + +class DidemoICCV17SS(TemporalProposalsBase): + """Original search space of moments proposed in ICCV-2017 + + Attributes: + clip_length_min (float) : minimum length, in seconds, of a video clip. + proposals (numpy array) : of shape [21, 2] representing all the + possible temporal segments of valid annotations of DiDeMo dataset. + It represents the search space of a temporal localization + algorithm. + + Reference: Hendricks et al. Localizing Moments in Video with Natural + Language. ICCV 2017. + """ + clip_length_min = 5.0 + + def __init__(self, *args, dtype=np.float32, **kwargs): + clips_indices = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] + for i in itertools.combinations(range(len(clips_indices)), 2): + clips_indices.append(i) + self.proposals = np.array(clips_indices, dtype=dtype) + self.proposals *= self.clip_length_min + self.proposals[:, 1] += self.clip_length_min + + def __call__(self, *args, **kwargs): + return self.proposals + + +class SlidingWindowMSRSS(TemporalProposalsBase): + """Multi-scale sliding window with relative stride within the same scale + + Attributes: + length (float) : length of smallest window. + scales (sequence of int) : duration of moments relative to + `length`. + stride (float) : relative stride between two windows with the same + duration. We used different strides for each scale, rounding them + towards a multiple of `length`. Note that the minimum stride for + any window will be the `length` itself. 
+ dtype (numpy.dtype) : + """ + + def __init__(self, length, scales, stride=0.5, round_base=0.5, dtype=np.float32): + self.length = length + self.scales = scales + self.round_base = round_base + self.relative_stride = stride + # pick strides per scale that are multiples of length + self.strides = [max(round(s * stride / round_base) * round_base, round_base) + * length for s in scales] + self.dtype = dtype + assert len(scales) > 0 + + def sliding_windows(self, t_end, t_start=0): + """sliding canonical windows over a given time interval""" + windows_ = [] + for i, stride in enumerate(self.strides): + num_i = np.ceil((t_end - t_start) / stride) + windows_i = np.empty((int(num_i), 2), dtype=np.float32) + windows_i[:, 0] = np.arange(t_start, t_end, stride) + windows_i[:, 1] = windows_i[:, 0] + self.length * self.scales[i] + windows_i[windows_i[:, 1] > t_end, 1] = t_end + windows_.append(windows_i) + # print("--------------------------------{}".format(i)) + # print(windows_i) + # import sys + # sys.exit(1) + windows = np.concatenate(windows_, axis=0) + # Hacky way to make windows fit inside video + # It implies windows at the end may not belong to the set spanned by + # length and scales. + return np.unique(windows, axis=0) + + def __call__(self, video_id, metadata=None, feature_collection=None): + """return: (N_window, 2), each row contains (start, end)""" + duration = metadata.get('duration') + assert duration is not None + return self.sliding_windows(duration) + + +ProposalConfigs = { + "didemo": { + "proposal_interface": "DidemoICCV17SS", + "clip_length": 2.5, + }, + "tvr": { + "length": 3, # min proposal length + "scales": [1, 2, 4, 8], + "stride": 0.3, + "round_base": 1, + "min_proposal_length": 3, # length * min(scales) + "clip_length": 1.5, # length should be divisible by clip_length + "proposal_interface": "SlidingWindowMSRSS", + }, + "anet_cap": { + "length": 5, + "scales": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26], + "stride": 0.3, + "round_base": 1, + "min_proposal_length": 10, # length * min(scales) + "clip_length": 5, # length * min(scales) / 2 + "proposal_interface": "SlidingWindowMSRSS", + }, + "charades_sta": { + "length": 3, + "scales": [2, 3, 4, 5, 6, 7, 8], + "stride": 0.3, + "round_base": 1, + "min_proposal_length": 6, # length * min(scales) + "clip_length": 3, # length * min(scales) / 2 + "proposal_interface": "SlidingWindowMSRSS", + }, + "profiling": { + "length": 5, + "scales": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + "stride": 0.3, + "round_base": 1, + "clip_length": 5, # length * min(scales) / 2 + "proposal_interface": "SlidingWindowMSRSS", + }, +} +""" +'min_clip_length' is used to uniformly segment the video into smaller clips, it is a half of +the 'min_proposal_length'. Thus we can enforce each moment has at least 2 clips. 
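+For example, the 'tvr' config above uses clip_length=1.5, half of its min_proposal_length=3, so the shortest proposal spans exactly 2 clips.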
+""" + + +def get_proposal_interface(dset_name): + """ dset_name (str): one of ["tvr"] """ + assert dset_name in ProposalConfigs + if dset_name == "didemo": + return DidemoICCV17SS() + else: + arg_names = ["length", "scales", "stride", "round_base"] + func_args = {k: ProposalConfigs[dset_name][k] for k in arg_names} + return SlidingWindowMSRSS(**func_args) + + +if __name__ == '__main__': + test_fns_args = [(DidemoICCV17SS, (),), + (SlidingWindowMSRSS, (1.5, [2, 4, 6, 12]))] + for fn_i, args_i in test_fns_args: + proposal_fn = fn_i(*args_i) + x = proposal_fn('hola', {'duration': 15}) + if fn_i == DidemoICCV17SS: + assert len(x) == 21 diff --git a/baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt b/baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..780ec601a2d1e3ed911bcdc5d92ddcff6be8592d --- /dev/null +++ b/baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt @@ -0,0 +1,61 @@ + +""" +{'avg_num_proposals': 158.30197338228544, + 'dset_name': 'tvr', + 'filename': 'data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 1, + 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.8927030563354492, 0.7: 0.6690225005149841}, + 'total_num_proposals': 344940} + + +{'avg_num_proposals': 213.3295089490592, + 'dset_name': 'tvr', + 'filename': 'data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'min_clip_length': 1.5, + 'min_proposal_length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9612666368484497, 0.7: 0.8215695023536682}, + 'total_num_proposals': 464845} + -- + + +{'avg_num_proposals': 213.3295089490592, + 'dset_name': 'tvr', + 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9612666368484497, 0.7: 0.8215695023536682}} + + +{'avg_num_proposals': 263.3845800826067, + 'dset_name': 'tvr', + 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9841211438179016, 0.7: 0.8567232489585876}} + + +{'avg_num_proposals': 242.97246443322626, + 'dset_name': 'tvr', + 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [0.5, 1, 2, 3, 4, 5, 6, 7, 8], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9608076810836792, 0.7: 0.8212941884994507}} +""" \ No newline at end of file diff --git a/baselines/clip_alignment_with_language/mix_model_prediction.py b/baselines/clip_alignment_with_language/mix_model_prediction.py new file mode 100644 index 0000000000000000000000000000000000000000..9b74d37b62c954119e33eac58d79951c32b15165 --- /dev/null +++ b/baselines/clip_alignment_with_language/mix_model_prediction.py @@ -0,0 +1,86 @@ +""" +Implement the CAL + CAL (TEF) model 
mentioned in +``` +@article{Escorcia2019TemporalLO, + title={Temporal Localization of Moments in Video Collections with Natural Language}, + author={Victor Escorcia and Mattia Soldan and Josef Sivic and Bernard Ghanem and Bryan Russell}, + journal={ArXiv}, + year={2019}, + volume={abs/1907.12763} +} +``` + +Methods: + 1, Give top200 predictions for each query in CAL then using CAL (TEF) to re-rank. + 2, This is approximated by re-ranking the top200 CAL using top1000 CAL(TEF) -- we assume they will be all covered. +""" + +import torch +import subprocess +import numpy as np +from tqdm import tqdm +from utils.basic_utils import load_json, save_json + + +def load_saved_res(pred_path): + if pred_path.endswith(".json"): + pred = load_json(pred_path) + else: + pred = torch.load(pred_path) + vcmr_res = {e["desc_id"]: e for e in pred["VCMR"]} + video2idx = pred["video2idx"] + return vcmr_res, video2idx + + +def main_mix_results(pred_path, tef_pred_path, save_path, max_after_nms=100): + """ + Args: + pred_path: contains top-200 VCMR predictions + tef_pred_path: contains top-1000 VCMR predictions + save_path: + max_after_nms: int, + Returns: + save + """ + vcmr_res, video2idx = load_saved_res(pred_path) + tef_vcmr_res, video2idx = load_saved_res(tef_pred_path) + + reranked_vcmr_res = {} + num_valid = [] + for desc_id, preds in tqdm(vcmr_res.items(), desc="Loop over the predictions"): + tef_preds = tef_vcmr_res[desc_id]["predictions"] + pred_moments = set([tuple(e[:3]) for e in preds["predictions"]]) + reranked_moments = [e for e in tef_preds if tuple(e[:3]) in pred_moments][:max_after_nms] + num_valid += [len(reranked_moments)] + if len(reranked_moments) != 100: + reranked_moments += reranked_moments[:100 - len(reranked_moments)] + reranked_vcmr_res[desc_id] = dict( + predictions=reranked_moments, + desc_id=desc_id, + desc=preds["desc"] + ) + + print("There are {} moments founded on average".format(np.mean(num_valid))) + reranked_predictions = dict( + VCMR=list(reranked_vcmr_res.values()), + video2idx=video2idx + ) + + save_json(reranked_predictions, save_path) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--pred_path", type=str, help="path to prediction res") + parser.add_argument("--tef_pred_path", type=str, help="path to TEF prediction res") + parser.add_argument("--save_path", type=str, help="path to save the re-ranked predictions, same dir as --pred_path") + parser.add_argument("--gt_path", type=str, help="path to ground truth file") + args = parser.parse_args() + + main_mix_results(args.pred_path, args.tef_pred_path, args.save_path) + + metrics_path = args.save_path.replace(".json", "_metrics.json") + eval_cmd = "python standalone_eval/eval.py --submission_path " + args.save_path + " --gt_path " + args.gt_path + \ + " --save_path " + metrics_path + results = subprocess.run(eval_cmd, shell=True) diff --git a/baselines/clip_alignment_with_language/model.py b/baselines/clip_alignment_with_language/model.py new file mode 100644 index 0000000000000000000000000000000000000000..06e7ec39f34a60ec8fa7be070fdd24659e536a6c --- /dev/null +++ b/baselines/clip_alignment_with_language/model.py @@ -0,0 +1,299 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from utils.model_utils import RNNEncoder +from easydict import EasyDict as edict + + +cal_base_cfg = edict( + visual_input_size=2048, # changes based on visual input type + textual_input_size=768, + query_feat_size=768, + visual_hidden_size=500, # + output_size=100, + 
embedding_size=768, + lstm_hidden_size=1000, + margin=0.1, # margin for ranking loss + loss_type="hinge", # loss type, 'hinge' or 'lse' + inter_loss_weight=0.4, # weight for inter negatives + ctx_mode="video" +) + + +class CAL(nn.Module): + def __init__(self, config): + super(CAL, self).__init__() + self.config = config + + self.moment_mlp = nn.Sequential( + nn.Linear(config.visual_input_size, config.visual_hidden_size), + nn.ReLU(True), + nn.Linear(config.visual_hidden_size, config.output_size), + ) + + self.query_lstm = RNNEncoder(word_embedding_size=config.embedding_size, + hidden_size=config.lstm_hidden_size, + bidirectional=False, + rnn_type="lstm", + dropout_p=0, + n_layers=1, + return_outputs=False) + + self.query_linear = nn.Linear(config.lstm_hidden_size, config.output_size) + + def moment_encoder(self, moment_feat): + """moment_feat: (N, L_clip, D_v)""" + return F.normalize(self.moment_mlp(moment_feat), p=2, dim=-1) # (N, L_clip, D_o) + + def query_encoder(self, query_feat, query_mask): + """ + Args: + query_feat: (N, L_q, D_q), torch.float32 + query_mask: (N, L_q), torch.float32, with 1 indicates valid query, 0 indicates mask + """ + _, hidden = self.query_lstm(query_feat, torch.sum(query_mask, dim=1).long()) + return F.normalize(self.query_linear(hidden), p=2, dim=-1) # (N, D_o) + + def compute_pdist(self, query_embedding, moment_feat, moment_mask): + """ pairwise L2 distance + Args: + query_embedding: (N, D_o) + moment_feat: (N, L_clip, D_v) + moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding + """ + moment_embedding = self.moment_encoder(moment_feat) # (N, L_clip, D_o) + moment_clip_dist = torch.sum((moment_embedding - query_embedding.unsqueeze(1)) ** 2, dim=2) # (N, L_clip) + moment_dist = torch.sum(moment_clip_dist * moment_mask, dim=1) / moment_mask.sum(1) # (N, ) + return moment_dist # (N, ) + + @classmethod + def compute_cdist_inference(cls, query_embeddings, moment_embeddings, moment_mask): + """ Compute L2 distance for every possible pair of queries and proposals. This is different from + compute_pdist as the latter computes only pairs at each row. 
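+        Per-clip squared distances are averaged over the valid clips of each
+        proposal (via moment_mask), so entry (i, j) equals the value compute_pdist
+        would give for query i and proposal j when fed the same embeddings.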
+ Args: + query_embeddings: (N_q, D_o) + moment_embeddings: (N_prop, N_clips, D_o) + moment_mask: (N_prop, N_clips) + return: + query_moment_scores: (N_q, N_prop) + """ + # sync device + query_device = query_embeddings.device # convert to cuda if we want to use GPU + if moment_embeddings.device != query_device: + moment_embeddings = moment_embeddings.to(query_device) + moment_mask = moment_mask.to(query_device) + + # compute + n_query = query_embeddings.shape[0] + n_prop, n_clips, d = moment_embeddings.shape + query_clip_dist = torch.cdist( + query_embeddings, moment_embeddings.reshape(-1, d), p=2) ** 2 # (N_q, N_prop * N_clips) + query_clip_dist = query_clip_dist.reshape(n_query, n_prop, n_clips) + query_moment_dist = torch.sum( + query_clip_dist * moment_mask.unsqueeze(0), dim=2) / moment_mask.sum(1).unsqueeze(0) + return query_moment_dist # (N_q, N_prop) + + def forward(self, query_feat, query_mask, pos_moment_feat, pos_moment_mask, + intra_neg_moment_feat, intra_neg_moment_mask, + inter_neg_moment_feat, inter_neg_moment_mask): + """ + Args: + query_feat: (N, L, D_q) + query_mask: (N, L) + pos_moment_feat: (N, L_clip_1, D_v) + pos_moment_mask: (N, L_clip_1) + intra_neg_moment_feat: (N, L_clip_2, D_v) + intra_neg_moment_mask: (N, L_clip_2) + inter_neg_moment_feat: (N, L_clip_3, D_v) + inter_neg_moment_mask: (N, L_clip_2) + """ + query_embed = self.query_encoder(query_feat, query_mask) # (N, D_o) + pos_dist = self.compute_pdist(query_embed, pos_moment_feat, pos_moment_mask) # (N, ) + intra_neg_dist = self.compute_pdist(query_embed, intra_neg_moment_feat, intra_neg_moment_mask) # (N, ) + if self.config.inter_loss_weight == 0: # should be zero for tef_only method. + loss_inter = 0. + else: + inter_neg_dist = self.compute_pdist(query_embed, inter_neg_moment_feat, inter_neg_moment_mask) # (N, ) + loss_inter = self.calc_loss(pos_dist, inter_neg_dist) + + loss = self.calc_loss(pos_dist, intra_neg_dist) + self.config.inter_loss_weight * loss_inter + return loss + + def calc_loss(self, pos_dist, neg_dist): + """ Note here we encourage positive distance to be smaller than negative distance. 
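+        For example, with margin=0.1, pos_dist=0.5 and neg_dist=0.4 for a single pair,
+        the 'hinge' term is max(0, 0.1 + 0.5 - 0.4) = 0.2 and the 'lse' term is
+        log(1 + exp(0.5 - 0.4)) ~= 0.744; both are then averaged over the batch.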
+ Args: + pos_dist: (N, ), torch.float32 + neg_dist: (N, ), torch.float32 + """ + if self.config.loss_type == "hinge": # max(0, m + S_pos - S_neg) + return torch.clamp(self.config.margin + pos_dist - neg_dist, min=0).sum() / len(pos_dist) + elif self.config.loss_type == "lse": # log[1 + exp(S_pos - S_neg)] + return torch.log1p(torch.exp(pos_dist - neg_dist)).sum() / len(pos_dist) + else: + raise NotImplementedError("Only support 'hinge' and 'lse'") + + +class CALWithSub(nn.Module): + def __init__(self, config): + super(CALWithSub, self).__init__() + self.config = config + self.use_video = "video" in config.ctx_mode + self.use_sub = "sub" in config.ctx_mode + self.use_tef = "tef" in config.ctx_mode + self.tef_only = self.use_tef and not self.use_video and not self.use_sub + + if self.use_video or self.tef_only: + self.video_moment_mlp = nn.Sequential( + nn.Linear(config.visual_input_size, config.visual_hidden_size), + nn.ReLU(True), + nn.Linear(config.visual_hidden_size, config.output_size), + ) + + if self.use_sub: + self.sub_moment_mlp = nn.Sequential( + nn.Linear(config.textual_input_size, config.visual_hidden_size), + nn.ReLU(True), + nn.Linear(config.visual_hidden_size, config.output_size), + ) + + self.query_lstm = RNNEncoder(word_embedding_size=config.query_feat_size, + hidden_size=config.lstm_hidden_size, + bidirectional=False, + rnn_type="lstm", + dropout_p=0, + n_layers=1, + return_outputs=False) + + self.query_linear = nn.Linear(config.lstm_hidden_size, config.output_size) + + def moment_encoder(self, moment_feat, module_name="video"): + """moment_feat: (N, L_clip, D_v)""" + if moment_feat is not None: + encoder = getattr(self, module_name + "_moment_mlp") + return F.normalize(encoder(moment_feat), p=2, dim=-1) # (N, L_clip, D_o) + else: + return None + + def query_encoder(self, query_feat, query_mask): + """ + Args: + query_feat: (N, L_q, D_q), torch.float32 + query_mask: (N, L_q), torch.float32, with 1 indicates valid query, 0 indicates mask + """ + _, hidden = self.query_lstm(query_feat, torch.sum(query_mask, dim=1).long()) + return F.normalize(self.query_linear(hidden), p=2, dim=-1) # (N, D_o) + + def _compute_pdist(self, query_embedding, moment_feat, moment_mask, module_name="video"): + """ pairwise L2 distance + Args: + query_embedding: (N, D_o) + moment_feat: (N, L_clip, D_v) + moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding + """ + moment_embedding = self.moment_encoder(moment_feat, module_name=module_name) # (N, L_clip, D_o) + moment_clip_dist = torch.sum((moment_embedding - query_embedding.unsqueeze(1)) ** 2, dim=2) # (N, L_clip) + moment_dist = torch.sum(moment_clip_dist * moment_mask, dim=1) / moment_mask.sum(1) # (N, ) + return moment_dist # (N, ) + + def compute_pdist(self, query_embedding, moment_video_feat, moment_sub_feat, moment_mask): + """ pairwise L2 distance + Args: + query_embedding: (N, D_o) + moment_video_feat: (N, L_clip, D_v) + moment_sub_feat: (N, L_clip, D_t) + moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding + """ + divisor = (self.use_video or self.tef_only) + self.use_sub + video_moment_dist = self._compute_pdist(query_embedding, moment_video_feat, moment_mask, module_name="video") \ + if self.use_video or self.tef_only else 0 + sub_moment_dist = self._compute_pdist(query_embedding, moment_sub_feat, moment_mask, module_name="sub") \ + if self.use_sub else 0 + return (video_moment_dist + sub_moment_dist) / divisor # (N, ) + + def _compute_cdist_inference(self, 
query_embeddings, moment_embeddings, moment_mask): + """ Compute L2 distance for every possible pair of queries and proposals. This is different from + compute_pdist as the latter computes only pairs at each row. + Args: + query_embeddings: (N_q, D_o) + moment_embeddings: (N_prop, N_clips, D_o) + moment_mask: (N_prop, N_clips) + return: + query_moment_scores: (N_q, N_prop) + """ + # sync device + query_device = query_embeddings.device # convert to cuda if we want to use GPU + if moment_embeddings.device != query_device: + moment_embeddings = moment_embeddings.to(query_device) + moment_mask = moment_mask.to(query_device) + + # compute + n_query = query_embeddings.shape[0] + n_prop, n_clips, d = moment_embeddings.shape + query_clip_dist = torch.cdist( + query_embeddings, moment_embeddings.reshape(-1, d), p=2) ** 2 # (N_q, N_prop * N_clips) + query_clip_dist = query_clip_dist.reshape(n_query, n_prop, n_clips) + query_moment_dist = torch.sum( + query_clip_dist * moment_mask.unsqueeze(0), dim=2) / moment_mask.sum(1).unsqueeze(0) + return query_moment_dist # (N_q, N_prop) + + def compute_cdist_inference(self, query_embeddings, video_moment_embeddings, sub_moment_embeddings, moment_mask): + divisor = (self.use_video or self.tef_only) + self.use_sub + video_moment_dist = self._compute_cdist_inference(query_embeddings, video_moment_embeddings, moment_mask) \ + if self.use_video or self.tef_only else 0 + sub_moment_dist = self._compute_cdist_inference(query_embeddings, sub_moment_embeddings, moment_mask) \ + if self.use_sub else 0 + return (video_moment_dist + sub_moment_dist) / divisor # (N_q, N_prop) + + def forward(self, query_feat, query_mask, pos_moment_video_feat, pos_moment_video_mask, + intra_neg_moment_video_feat, intra_neg_moment_video_mask, + inter_neg_moment_video_feat, inter_neg_moment_video_mask, + pos_moment_sub_feat, pos_moment_sub_mask, + intra_neg_moment_sub_feat, intra_neg_moment_sub_mask, + inter_neg_moment_sub_feat, inter_neg_moment_sub_mask): + """ + Args: + query_feat: (N, L, D_q) + query_mask: (N, L) + pos_moment_video_feat: (N, L_clip_1, D_v) + pos_moment_video_mask: (N, L_clip_1) + intra_neg_moment_video_feat: (N, L_clip_2, D_v) + intra_neg_moment_video_mask: (N, L_clip_2) + inter_neg_moment_video_feat: (N, L_clip_3, D_v) + inter_neg_moment_video_mask: (N, L_clip_2) + pos_moment_sub_feat: + pos_moment_sub_mask: + intra_neg_moment_sub_feat: + intra_neg_moment_sub_mask: + inter_neg_moment_sub_feat: + inter_neg_moment_sub_mask: + """ + query_embed = self.query_encoder(query_feat, query_mask) # (N, D_o) + pos_dist = self.compute_pdist( + query_embed, pos_moment_video_feat, pos_moment_sub_feat, + moment_mask=pos_moment_sub_mask if self.use_sub else pos_moment_video_mask) # (N, ) + intra_neg_dist = self.compute_pdist( + query_embed, intra_neg_moment_video_feat, intra_neg_moment_sub_feat, + moment_mask=intra_neg_moment_sub_mask if self.use_sub else intra_neg_moment_video_mask) # (N, ) + if self.config.inter_loss_weight == 0: # should be zero for tef_only method. + loss_inter = 0. 
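+        # Intra-video negatives are different moments from the positive video, while
+        # inter-video negatives come from another video (see sample_intra_neg_moment /
+        # sample_inter_video_negative in the dataset); the inter term is weighted by
+        # inter_loss_weight and skipped when that weight is 0 (e.g. TEF-only models).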
+ else: + inter_neg_dist = self.compute_pdist( + query_embed, inter_neg_moment_video_feat, inter_neg_moment_sub_feat, + moment_mask=inter_neg_moment_sub_mask if self.use_sub else inter_neg_moment_video_mask) # (N, ) + loss_inter = self.calc_loss(pos_dist, inter_neg_dist) + + loss = self.calc_loss(pos_dist, intra_neg_dist) + self.config.inter_loss_weight * loss_inter + return loss + + def calc_loss(self, pos_dist, neg_dist): + """ Note here we encourage positive distance to be smaller than negative distance. + Args: + pos_dist: (N, ), torch.float32 + neg_dist: (N, ), torch.float32 + """ + if self.config.loss_type == "hinge": # max(0, m + S_pos - S_neg) + return torch.clamp(self.config.margin + pos_dist - neg_dist, min=0).sum() / len(pos_dist) + elif self.config.loss_type == "lse": # log[1 + exp(S_pos - S_neg)] + return torch.log1p(torch.exp(pos_dist - neg_dist)).sum() / len(pos_dist) + else: + raise NotImplementedError("Only support 'hinge' and 'lse'") diff --git a/baselines/clip_alignment_with_language/proposal_retrieval_dataset.py b/baselines/clip_alignment_with_language/proposal_retrieval_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..dcbd86c61b565d6654db854e33fa9ef4b623a906 --- /dev/null +++ b/baselines/clip_alignment_with_language/proposal_retrieval_dataset.py @@ -0,0 +1,587 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +import math +import random +from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array +from utils.tensor_utils import pad_sequences_1d +from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface +from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \ + get_didemo_agreed_ts +from standalone_eval.eval import compute_temporal_iou_batch + +logger = logging.getLogger(__name__) + + +class ProposalRetrievalDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + pos_iou_thd: float, in [0, 1], >= pos_iou_thd are defined as positive + neg_iou_thd: float, in [0, 1], < neg_iou_thd are defined as negative + Return: + a dict: { + "meta": { + "desc_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + "pos_moment": [st (float), ed (float)], seconds, IoU with "ts" >= pos_iou_thd + "intra_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd + "inter_neg_vid_name": str, + "inter_neg_duration": float, + "inter_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd + } + "model_inputs": { + "desc_feat": torch.tensor, (L, D_t) + "pos_moment_feat": torch.tensor, (n_clip_in_moment, D) + "intra_neg_moment_feat": torch.tensor, (n_clip_in_moment, D) + "inter_neg_moment_feat": torch.tensor, (n_clip_in_moment, D) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path, sub_bert_path, max_desc_len, + vid_feat_path, clip_length, vid_feat_size, sub_feat_size=0, ctx_mode="video_tef", + pos_iou_thd=0.7, neg_iou_thd=0.3, h5driver=None, data_ratio=1.0, + normalize_vfeat=True, normalize_tfeat=True, model_type="cal", + external_train_vr_res_path=None, corpus_path=None): + self.dset_name = dset_name + self.model_type = model_type + self.pool_local = model_type == "mcn" # pool local feature + self.data_path = data_path + self.data_ratio = data_ratio + + self.desc_bert_path = desc_bert_path + self.max_desc_len = max_desc_len + 
self.sub_bert_path = sub_bert_path + + self.vid_feat_path = vid_feat_path + self.clip_length = clip_length + self.ctx_mode = ctx_mode + + self.pos_iou_thd = pos_iou_thd + self.neg_iou_thd = neg_iou_thd + + self.vid_feat_output_size = 2 * vid_feat_size * ("video" in ctx_mode) + 2 * ("tef" in ctx_mode) + self.sub_feat_output_size = 2 * sub_feat_size * ("sub" in ctx_mode) + 2 * ("tef" in ctx_mode) + + # prepare desc data + self.data = load_jsonl(data_path) + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.proposal_fn = get_proposal_interface(dset_name) + if self.ctx_mode != "tef": + self.vid_feat_h5 = h5py.File(self.vid_feat_path, "r", driver=h5driver) + self.desc_bert_h5 = h5py.File(self.desc_bert_path, "r", driver=h5driver) + if "sub" in self.ctx_mode: + self.sub_bert_h5 = h5py.File(self.sub_bert_path, "r", driver=h5driver) + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if external_train_vr_res_path is not None: + video_data = load_json(corpus_path)["train"] + # {video_idx: [vid_name, vid_duration]} + video_idx2name_dur_pair = {v[1]: [k, v[0]] for k, v in video_data.items()} + external_vr_res = load_json(external_train_vr_res_path) + # {desc_id: [(vid_name, vid_duration), ...]} + self.desc_id2video_names_dur_pairs = \ + {e["desc_id"]: [video_idx2name_dur_pair[int(sub_e[0])] for sub_e in e["predictions"]] + for e in external_vr_res["VR"]} # ordered + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + desc_id=raw_data["desc_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ts=raw_data["ts"] if self.dset_name != "didemo" else get_didemo_agreed_ts(raw_data["ts"]), + ) + model_inputs = dict() + query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + model_inputs["query_feat"] = torch.from_numpy(query_feat) + + # sample positive and negative moments + meta["pos_moment"] = self.align_ts_to_clip_boundaries(meta["duration"], meta["ts"]) + meta["intra_neg_moment"] = self.sample_intra_neg_moment(meta["duration"], meta["ts"]) + meta["inter_neg_moment"], meta["inter_neg_vid_name"], meta["inter_neg_duration"] = \ + self.sample_inter_video_negative(meta["vid_name"], meta["pos_moment"] / meta["duration"], + desc_id=meta["desc_id"]) + + pos_tef, intra_neg_tef, inter_neg_tef = (None,) * 3 + if self.use_tef: + pos_tef = meta["pos_moment"] / meta["duration"] # temporal endpoint feature, (2, ) + intra_neg_tef = meta["intra_neg_moment"] / meta["duration"] + inter_neg_tef = meta["inter_neg_moment"] / meta["inter_neg_duration"] + + if self.use_video: + pos_v_feat = self.vid_feat_h5[meta["vid_name"]] # (N_frm, D) + neg_v_feat = self.vid_feat_h5[meta["inter_neg_vid_name"]] + pos_v_ctx_feat = np.mean(pos_v_feat, axis=0) + neg_v_ctx_feat = np.mean(neg_v_feat, axis=0) + if self.normalize_vfeat: + pos_v_ctx_feat = l2_normalize_np_array(pos_v_ctx_feat) + neg_v_ctx_feat = l2_normalize_np_array(neg_v_ctx_feat) + pos_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["pos_moment"], + normalize=self.normalize_vfeat, + fix_outbound=True, 
pool_local=self.pool_local) + intra_neg_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["intra_neg_moment"], + normalize=self.normalize_vfeat, + fix_outbound=True, pool_local=self.pool_local) + inter_neg_moment_v_feat = self.get_moment_feat(neg_v_feat, meta["inter_neg_moment"], + normalize=self.normalize_vfeat, + fix_outbound=True, pool_local=self.pool_local) + + # concat features, [video_clip_feat; video_context_feat; temporal_endpoint_feat] + model_inputs["pos_moment_video_feat"] = self.concat_feat_adv( + moment_feats=[pos_moment_v_feat, pos_v_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode) + model_inputs["intra_neg_moment_video_feat"] = self.concat_feat_adv( + moment_feats=[intra_neg_moment_v_feat, pos_v_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode) + model_inputs["inter_neg_moment_video_feat"] = self.concat_feat_adv( + moment_feats=[inter_neg_moment_v_feat, neg_v_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode) + else: + for k in ["pos_moment_video_feat", "intra_neg_moment_video_feat", "inter_neg_moment_video_feat"]: + model_inputs[k] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + pos_s_feat = self.sub_bert_h5[meta["vid_name"]] # (N_words, D_t) + neg_s_feat = self.sub_bert_h5[meta["inter_neg_vid_name"]] + pos_s_ctx_feat = np.mean(pos_s_feat, axis=0) + neg_s_ctx_feat = np.mean(neg_s_feat, axis=0) + if self.normalize_tfeat: + pos_s_ctx_feat = l2_normalize_np_array(pos_s_ctx_feat) + neg_s_ctx_feat = l2_normalize_np_array(neg_s_ctx_feat) + pos_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["pos_moment"], + normalize=self.normalize_tfeat, + fix_outbound=True, pool_local=self.pool_local) + intra_neg_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["intra_neg_moment"], + normalize=self.normalize_tfeat, + fix_outbound=True, pool_local=self.pool_local) + inter_neg_moment_s_feat = self.get_moment_feat(neg_s_feat, meta["inter_neg_moment"], + normalize=self.normalize_tfeat, + fix_outbound=True, pool_local=self.pool_local) + + # concat features, [sub_clip_feat; sub_context_feat; temporal_endpoint_feat] + model_inputs["pos_moment_sub_feat"] = self.concat_feat_adv( + moment_feats=[pos_moment_s_feat, pos_s_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode) + model_inputs["intra_neg_moment_sub_feat"] = self.concat_feat_adv( + moment_feats=[intra_neg_moment_s_feat, pos_s_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode) + model_inputs["inter_neg_moment_sub_feat"] = self.concat_feat_adv( + moment_feats=[inter_neg_moment_s_feat, neg_s_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode) + else: + for k in ["pos_moment_sub_feat", "intra_neg_moment_sub_feat", "inter_neg_moment_sub_feat"]: + model_inputs[k] = torch.zeros((2, 2)) + + if not self.use_sub and not self.use_video and self.use_tef: # use video stream + model_inputs["pos_moment_video_feat"] = \ + self.concat_feat_adv(tef=pos_tef, ctx_mode=self.ctx_mode) + model_inputs["intra_neg_moment_video_feat"] = \ + self.concat_feat_adv(tef=intra_neg_tef, ctx_mode=self.ctx_mode) + model_inputs["inter_neg_moment_video_feat"] = \ + self.concat_feat_adv(tef=inter_neg_tef, ctx_mode=self.ctx_mode) + return dict(meta=meta, model_inputs=model_inputs) + + def align_ts_to_clip_boundaries(self, duration, ts): + """ # TODO Do we really need this??? 
+ Generate a moment [st, ed] that is most close to a clip boundary, + st and ed must be a multiple of self.clip_length, and ed <= duration + duration: float, + ts: [st (float), ed (float)], ground_truth ts + """ + clip_aligned_ts = np.array([math.floor(ts[0] / self.clip_length), + math.ceil(ts[1] / self.clip_length)]) * self.clip_length + clip_aligned_ts[1] = min(clip_aligned_ts[1], duration) + return clip_aligned_ts + + def sample_intra_neg_moment(self, duration, ts): + """ Generate a intra negative moment given the video duration and the GT ts. + The returned moment will be aligned to clip boundaries. + 1) neg_moment has at least 2 clips + 2) its iou with ts should be < self.neg_iou_thd + Args: + duration: float + ts: [st (float), ed (float)], ground_truth ts + + Returns: + + """ + max_n_search = 5 # search at most max_n_search times, so the program will not be stuck in infinite loops. + sampled_moments = self.sample_ts_at_clip_boundaries(duration, n_pairs=max_n_search) # (n_pairs, 2) + sampled_moments_ious = compute_temporal_iou_batch(sampled_moments, ts) # (n_pairs, ) + smallest_iou_idx = np.argmin(sampled_moments_ious) + sampled_moment = sampled_moments[smallest_iou_idx] + # only a small number (<20 with max_n_search==10) of samples are wrong, + # usually when the video_duration is too short. + # if sampled_moments_ious[smallest_iou_idx] >= self.neg_iou_thd: + # logger.warning("the sampled intra-neg might be wrong. " + # "v_dur {}, ts {}, sampled neg moment {}, iou {}" + # .format(duration, ts, sampled_moment, sampled_moments_ious[smallest_iou_idx])) + return sampled_moment + + def sample_ts_at_clip_boundaries(self, duration, n_pairs=1): + """sample n_pairs moment at clip boundaries, each has at least two clips.""" + # '+ self.clip_length' since we assume indexing using [clip_st_idx, clip_ed_idx), + moments = np.random.randint(0, np.ceil(duration / self.clip_length), size=(n_pairs, 2)) + moments = np.sort(moments, axis=1) * self.clip_length + less_equal = moments[:, 1] - moments[:, 0] <= self.clip_length + start_zero = moments[:, 0] == 0 + moments[:, 1][less_equal * start_zero] += self.clip_length + moments[:, 0][less_equal * (start_zero == False)] -= self.clip_length # keep as bool!!! + return moments + + def sample_inter_video_negative(self, pos_vid_name, normalized_pos_moment, desc_id=None): + """Sample a negative moment --> negative video + similar normalized moment. + 1) they are not from the same video + Args: + pos_vid_name: str, + normalized_pos_moment: np.ndarray, (2, ), value in [0, 1], normalized by duration. + desc_id: str + Returns: + moment: np.ndarray, (2, ), ts aligned to clip boundaries. + + """ + use_guided_negative = hasattr(self, "desc_id2video_names_dur_pairs") + if use_guided_negative: + top_videos = self.desc_id2video_names_dur_pairs[desc_id] + max_idx = len(top_videos) - 1 + + while True: # usually only run once. 
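+            # With an external VR ranking (use_guided_negative), draw an index from an
+            # exponential distribution (expovariate(0.1), mean 10) so the sample is
+            # biased towards the top-ranked, typically harder, negative videos;
+            # otherwise take the video of a randomly chosen training example.
+            # Repeat until the drawn video differs from the positive one.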
+ if use_guided_negative: + sampled_idx = min(max_idx, int(random.expovariate(0.1))) + sampled_video_name, sampled_video_dur = top_videos[sampled_idx] + else: + neg_vid_data = self.data[int(random.random() * len(self))] + sampled_video_name, sampled_video_dur = neg_vid_data["vid_name"], neg_vid_data["duration"] + if sampled_video_name != pos_vid_name: + inter_neg_moment = self.align_ts_to_clip_boundaries( + sampled_video_dur, sampled_video_dur * normalized_pos_moment) + break + + return inter_neg_moment, sampled_video_name, sampled_video_dur + + @classmethod + def get_clip_indices_from_moments(cls, moment, clip_length): + clip_st_ed_indices = moment / clip_length + return math.floor(clip_st_ed_indices[0]), math.ceil(clip_st_ed_indices[1]) + + def get_moment_feat(self, vid_feat, moment, normalize=True, fix_outbound=False, pool_local=False): + """Each moment contains multiple clips. + Inside means [moment[0], moment[1]] (seconds) + Args: + vid_feat: np.ndarray, (N_clips, D) + moment: [st (float), ed (float)], np.ndarray + normalize: L2 normalize features + fix_outbound: bool, + pool_local: whether to mean pool the features + Returns: + moment_feature: np.ndarray, ((moment[1] - moment[0]) / clip_length, D) or (D, ) + """ + clip_st_idx, clip_ed_idx = self.get_clip_indices_from_moments(moment, self.clip_length) + if fix_outbound: + vid_feat_len = len(vid_feat) + if clip_st_idx >= vid_feat_len: + clip_st_idx = vid_feat_len - 2 + moment_feat = vid_feat[clip_st_idx:clip_ed_idx] # indexed as [st, ed) + if pool_local: + moment_feat = np.mean(moment_feat, axis=0, keepdims=True) + if normalize: + moment_feat = l2_normalize_np_array(moment_feat) + return moment_feat # (n_clip_in_moment, D) or (D, ) + + @classmethod + def concat_feat_adv(cls, moment_feats=None, tef=None, to_torch=True, ctx_mode="tef"): + """ Concat moment_feat with other_feats and tef. All the features should be L2 normalized before concatenating + Args: + moment_feats: list of feats, one of them might be None. Other possible values are + ctx_feat (D, ) or sub(vid)_moment_feat (N_p, N_clips, D_t) or (N_clips, D_t). + The first non-None feature array is used as base for the rest to concatenate with. + tef: (N_p, 2) or (2, ), np.ndarray + to_torch: convert resulting np.ndarray to torch.tensor + ctx_mode: + """ + if ctx_mode == "tef": + assembled_feat = np.expand_dims(tef, axis=-2) + else: # concat moment_feat with all other_feats + moment_feats = [e for e in moment_feats if e is not None] # remove possible None (placeholder) + extra_dims = moment_feats[0].shape[:-1] # all others will need to broadcast to match it. 
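+            # Assembly order is [moment_clip_feat ; ctx_feat ; tef], so e.g. the video
+            # stream under ctx_mode "video_sub_tef" ends up with 2 * vid_feat_size + 2
+            # channels, matching vid_feat_output_size computed in __init__.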
+ if isinstance(extra_dims, int): # happens when len(moment_feat.shape) == 2 + extra_dims = (extra_dims, ) + last_dim_lengths = [0, ] + [e.shape[-1] for e in moment_feats] + if "tef" in ctx_mode: # add tef + last_dim_lengths += [2, ] + moment_feats += [np.expand_dims(tef, axis=-2), ] + + if len(moment_feats) > 1: + assembled_feat = np.empty(extra_dims + (sum(last_dim_lengths), ), dtype=np.float32) + last_dim_lengths_cumsum = [sum(last_dim_lengths[0:idx+1]) for idx in range(len(last_dim_lengths))] + for idx, feat in enumerate(moment_feats): + assembled_feat[..., last_dim_lengths_cumsum[idx]:last_dim_lengths_cumsum[idx+1]] = feat + else: + assembled_feat = moment_feats[0] + + if to_torch: + return torch.from_numpy(assembled_feat) + else: + return assembled_feat # (N_prop, N_clips, D_concat) or (N_clips, D_concat) + + +class ProposalRetrievalEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. + load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. + """ + def __init__(self, dset_name, eval_split_name, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, clip_length=None, + eval_proposal_bsz=None, ctx_mode="tef", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, + normalize_tfeat=True, max_n_proposals=90, model_type="cal"): + self.dset_name = dset_name + self.model_type = model_type + self.pool_local = model_type == "mcn" # pool local feature + self.eval_split_name = eval_split_name + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + self.max_n_proposals = max_n_proposals + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.data_path = data_path + self.query_data = load_jsonl(data_path) + if data_ratio != 1: + n_examples = int(len(self.query_data) * data_ratio) + self.query_data = self.query_data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path)[self.eval_split_name] + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + self.eval_proposal_bsz = eval_proposal_bsz + self.clip_length = clip_length + self.proposal_fn = get_proposal_interface(dset_name) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + 
self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + def load_gt_vid_name_for_query(self, load_gt_video): + """load_gt_video: bool, affect the returned value of self._get_item_query""" + assert "vid_name" in self.query_data[0] + self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return len(self.video_data) + else: + return len(self.query_data) + + def __getitem__(self, index): + if self.data_mode == "context": + return self._get_item_context(index) + else: + return self._get_item_query(index) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.query_data[index] + + meta = dict( + desc_id=raw_data["desc_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + model_inputs["query_feat"] = torch.from_numpy(query_feat) + return dict(meta=meta, model_inputs=model_inputs) + + def _get_item_context(self, index): + """No need to batch, since it has already been batched here""" + raw_data = self.video_data[index] + + # get proposals and sort in ascending order, to get more efficient batching + proposals = self.proposal_fn( + video_id="", metadata={"duration": raw_data["duration"]}) # np.ndarray (N_p, 2) + proposals_lengths = proposals[:, 1] - proposals[:, 0] # seconds + sorted_proposal_indices = np.argsort(proposals_lengths)[:self.max_n_proposals] + sorted_proposals = proposals[sorted_proposal_indices] + + # initialize with basic data + meta = dict( + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + proposals=sorted_proposals + ) + model_inputs = dict() + + n_proposal_batches = math.ceil(1.0 * len(sorted_proposals) / self.eval_proposal_bsz) + + tef_batched_list = [None, ] * n_proposal_batches + t_moments_mask_list = [None, ] * n_proposal_batches + if self.use_tef: + tef_array = sorted_proposals / meta["duration"] # (N_p, 2) + for batch_idx in range(n_proposal_batches): + st_m_idx = batch_idx * self.eval_proposal_bsz + ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz + tef_batched_list[batch_idx] = tef_array[st_m_idx:ed_m_idx] + t_moments_mask_list[batch_idx] = \ + np.ones((len(tef_batched_list[batch_idx]), 1), dtype=np.float32) + if not self.use_video and not self.use_sub: # use video stream + model_inputs["video_moment_features_list"] = [ + ProposalRetrievalDataset.concat_feat_adv(tef=t, ctx_mode=self.ctx_mode) for t in tef_batched_list] + model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in t_moments_mask_list] + + # extract/group/pad + if self.use_video: + v_feat = self.vid_feat_h5[meta["vid_name"]] # (N_frm, D) + v_ctx_feat = np.mean(v_feat, axis=0) # (D, ) + if self.normalize_vfeat: + v_ctx_feat = l2_normalize_np_array(v_ctx_feat) + v_padded_moments_features_list, v_moments_mask_list = \ + self.get_batched_moment_feat_for_all_proposals(v_feat, sorted_proposals, + pool_local=self.pool_local, + normalize=self.normalize_vfeat) + + model_inputs["video_moment_features_list"] = 
[ProposalRetrievalDataset.concat_feat_adv( + moment_feats=[v, v_ctx_feat], tef=t, ctx_mode=self.ctx_mode) + for v, t in zip(v_padded_moments_features_list, tef_batched_list)] + model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in v_moments_mask_list] + + if self.use_sub: + s_feat = self.sub_bert_h5[meta["vid_name"]] # (N_frm, D) + s_ctx_feat = np.mean(s_feat, axis=0) # (D, ) + if self.normalize_tfeat: + s_ctx_feat = l2_normalize_np_array(s_ctx_feat) + s_padded_moments_features_list, s_moments_mask_list = \ + self.get_batched_moment_feat_for_all_proposals(s_feat, sorted_proposals, + pool_local=self.pool_local, + normalize=self.normalize_tfeat) + model_inputs["sub_moment_features_list"] = [ProposalRetrievalDataset.concat_feat_adv( + moment_feats=[s, s_ctx_feat], tef=t, ctx_mode=self.ctx_mode) + for s, t in zip(s_padded_moments_features_list, tef_batched_list)] + model_inputs["sub_moment_mask_list"] = [torch.from_numpy(e) for e in s_moments_mask_list] + return dict(meta=meta, model_inputs=model_inputs) + + def get_batched_moment_feat_for_all_proposals(self, feature, moments, pool_local=False, normalize=True): + """proposals of the same video wil be segmented into multiple batches to accomodate GPU memory + pool_local: pool local feature into a single vector + """ + n_proposal_batches = math.ceil(1.0 * len(moments) / self.eval_proposal_bsz) + padded_moments_features_list = [None, ] * n_proposal_batches + moments_mask_list = [None, ] * n_proposal_batches + moments_features = self.get_moment_feat_for_all_proposals( + feature, moments, normalize=normalize, pool_local=pool_local) # N_p * [(N_clips, D), ] + for batch_idx in range(n_proposal_batches): + st_m_idx = batch_idx * self.eval_proposal_bsz + ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz + padded_moments_features, moments_mask = \ + pad_sequences_1d(moments_features[st_m_idx:ed_m_idx], dtype=np.float32) + padded_moments_features_list[batch_idx] = padded_moments_features + moments_mask_list[batch_idx] = moments_mask + assert np.sum(np.sum(moments_mask, axis=1) == 0) == 0, " err {}".format(moments_mask) + assert np.sum(np.sum(moments_mask_list[0], axis=1) == 0) == 0, " err {}".format(moments_mask_list) + return padded_moments_features_list, moments_mask_list + + def get_moment_feat_for_all_proposals(self, vid_feat, moments, normalize=True, pool_local=False): + """Each moment is comprised of multiple clips + Args: + vid_feat: np.ndarray, (N_clips, D) + moments: np.ndarray, (N_p, 2), each row is [st (float), ed (float)], + normalize: L2 normalize + pool_local: + Returns: + moments_features: list(np.ndarray), [(N_clips, D), ] * N_p, N_clips is changing. + """ + if normalize and not pool_local: + vid_feat = l2_normalize_np_array(vid_feat) + vid_feat_len = len(vid_feat) + moments_st_clip_indices = np.floor(moments[:, 0] / self.clip_length).astype(np.int64).clip(0, vid_feat_len-1) + moments_ed_clip_indices = np.ceil(moments[:, 1] / self.clip_length).astype(np.int64).clip(1, vid_feat_len) + moments_features = [] + for st_idx, ed_idx, m in zip(moments_st_clip_indices, moments_ed_clip_indices, moments): + feat = vid_feat[st_idx:ed_idx] + if pool_local: + feat = np.mean(feat, axis=0, keepdims=True) + if normalize: + feat = l2_normalize_np_array(feat) + moments_features.append(feat) + return moments_features + + +def proposal_retrieval_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? 
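+    # pad_sequences_1d pads each per-example tensor to a common length and returns an
+    # accompanying mask, so every value in batched_data is a (padded_feat, mask) tuple;
+    # prepare_batch_inputs below splits it into the separate "*_feat" / "*_mask" tensors
+    # that CALWithSub.forward expects.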
+ + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = {k: pad_sequences_1d([e["model_inputs"][k] for e in batch], dtype=torch.float32) + for k in model_inputs_keys} + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.clip_alignment_with_language.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh b/baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh new file mode 100644 index 0000000000000000000000000000000000000000..975130a0af44ee087d74baef9e89b43313a8aa9b --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# run at project root dir +dset_name=$1 # see case below +split_name=$2 # train/val/test, some datasets may not support all the 3 splits +result_dir="baselines/clip_alignment_with_language/results" + +echo "Running with dataset ${dset_name} with split ${split_name}" +case ${dset_name} in + tvr) # only supports train/val + eval_file_path=data/tvr_${split_name}_release.jsonl + save_path=${result_dir}/tvr_${split_name}_proposal_upper_bound.json + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Running evaluation" +python baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py \ +-dset_name=${dset_name} \ +-eval_file_path=${eval_file_path} \ +-save_path=${save_path} \ +-verbose diff --git a/baselines/clip_alignment_with_language/scripts/inference.sh b/baselines/clip_alignment_with_language/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..3a8fd1fe8d325f53b2fcd5f5c7b550848f82af0c --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/inference.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=(VR) +tasks+=(SVMR) +tasks+=(VCMR) +echo "tasks ${tasks[@]}" +python baselines/clip_alignment_with_language/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/clip_alignment_with_language/scripts/inference_mix.sh b/baselines/clip_alignment_with_language/scripts/inference_mix.sh new file mode 100644 index 0000000000000000000000000000000000000000..3503911fecdfd3a5f2708a42041928ff94034d0b --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/inference_mix.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/inference_mix.sh +eval_model=$1 # [mcn, cal], retrain models should only be paired with mee +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval/baselines/clip_alignment_with_language/results + +# setup eval model +if [[ ${eval_model} == mcn ]]; then + pred_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40 + tef_pred_dir=tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57 +elif [[ ${eval_model} == cal ]]; then + 
pred_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59 + tef_pred_dir=tvr-cal-video_sub_tef-res-2019_11_05_14_25_49 +fi + +pred_path=${project_root}/${pred_dir}/inference_tvr_test_public_max200_predictions_VR_SVMR_VCMR.json +save_path=${project_root}/${pred_dir}/inference_tvr_test_public_max200_predictions_VR_SVMR_VCMR_rerank_${tef_pred_dir}.json +tef_pred_path=${project_root}/${tef_pred_dir}/inference_tvr_test_public_max10000_predictions_VCMR.pt +gt_path=data/tvr_test_public_archive.jsonl + + +python baselines/clip_alignment_with_language/mix_model_prediction.py \ +--pred_path=${pred_path} \ +--tef_pred_path=${tef_pred_path} \ +--gt_path=${gt_path} \ +--save_path=${save_path} diff --git a/baselines/clip_alignment_with_language/scripts/inference_with_external.sh b/baselines/clip_alignment_with_language/scripts/inference_with_external.sh new file mode 100644 index 0000000000000000000000000000000000000000..7695a98cac4c5cd7bd6c04edd7a07e6ba2449055 --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/inference_with_external.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/inference_with_external.sh +#model_dir=$1 +# DO not use NMS, since it gives worse results +eval_model=$1 # [mcn, mcn_tef, cal, cal_tef, mcn_retrain, cal_retrain], retrain models should only be paired with mee +external_model=$2 # [mee, mcn, cal] +eval_split_name=$3 +eval_path=data/tvr_${eval_split_name}_release.jsonl +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval/baselines + +# setup eval model +if [[ ${eval_model} == mcn ]]; then + eval_model_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40 +elif [[ ${eval_model} == mcn_tef ]]; then + eval_model_dir=tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57 +elif [[ ${eval_model} == cal ]]; then + eval_model_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59 +elif [[ ${eval_model} == cal_tef ]]; then + eval_model_dir=tvr-cal-video_sub_tef-res-2019_11_05_14_25_49 +elif [[ ${eval_model} == mcn_tef_retrain ]]; then + eval_model_dir=tvr-mcn-video_sub_tef-+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57+-2019_11_06_02_26_49 +elif [[ ${eval_model} == cal_tef_retrain ]]; then + eval_model_dir=tvr-cal-video_sub_tef-+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-cal-video_sub_tef-res-2019_11_05_14_25_49+-2019_11_06_03_12_15 +fi + +# setup external +if [[ ${external_model} == mee ]]; then + external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39 + external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json +elif [[ ${external_model} == mcn ]]; then + external_model_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40 + external_inference_vr_res_path=${project_root}/clip_alignment_with_language/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR_SVMR_VCMR.json +elif [[ ${external_model} == cal ]]; then + external_model_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59 + external_inference_vr_res_path=${project_root}/clip_alignment_with_language/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR_SVMR_VCMR.json +fi + +tasks=(VR) +tasks+=(SVMR) +tasks+=(VCMR) +echo "tasks ${tasks[@]}" +python baselines/clip_alignment_with_language/inference.py \ +--model_dir ${eval_model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ 
+--external_inference_vr_res_path ${external_inference_vr_res_path} \ +--eval_id ${external_model_dir} \ +${@:4} + +#--use_intermediate \ # temporary removed + diff --git a/baselines/clip_alignment_with_language/scripts/re_train_cal.sh b/baselines/clip_alignment_with_language/scripts/re_train_cal.sh new file mode 100644 index 0000000000000000000000000000000000000000..f436b5dbf82243e80ac3d1e036b9b3403230ab70 --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/re_train_cal.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +lr=0.00005 +n_epoch=20 +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval +ckpt_filename="model.ckpt" +init_ckpt_path=${project_root}/baselines/clip_alignment_with_language/results/tvr-cal-video_sub_tef-res-2019_11_05_14_25_49/${ckpt_filename} +exp_id=+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-cal-video_sub_tef-res-2019_11_05_14_25_49+ +external_train_vr_res_path=${project_root}/baselines/mixture_embedding_experts/results/tvr-video_sub-res-2019_11_06_00_33_39/inference_tvr_train_None_predictions_VR.json +model_type=cal + +bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \ +--no_norm_vfeat \ +--model_type ${model_type} \ +--exp_id ${exp_id} \ +--init_ckpt_path ${init_ckpt_path} \ +--external_train_vr_res_path ${external_train_vr_res_path} \ +--lr ${lr} \ +--n_epoch ${n_epoch} \ +--max_es_cnt 5 \ +${@:1} diff --git a/baselines/clip_alignment_with_language/scripts/re_train_mcn.sh b/baselines/clip_alignment_with_language/scripts/re_train_mcn.sh new file mode 100644 index 0000000000000000000000000000000000000000..3bb3302e95edc06a3b24661d24b6a780ce58d81d --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/re_train_mcn.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +lr=0.00005 +n_epoch=20 +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval +ckpt_filename="model.ckpt" +init_ckpt_path=${project_root}/baselines/clip_alignment_with_language/results/tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57/${ckpt_filename} +exp_id=+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57+ +external_train_vr_res_path=${project_root}/baselines/mixture_embedding_experts/results/tvr-video_sub-res-2019_11_06_00_33_39/inference_tvr_train_None_predictions_VR.json +model_type=mcn + +bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \ +--no_norm_vfeat \ +--model_type ${model_type} \ +--exp_id ${exp_id} \ +--init_ckpt_path ${init_ckpt_path} \ +--external_train_vr_res_path ${external_train_vr_res_path} \ +--lr ${lr} \ +--n_epoch ${n_epoch} \ +--max_es_cnt 5 \ +${@:1} diff --git a/baselines/clip_alignment_with_language/scripts/train.sh b/baselines/clip_alignment_with_language/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..58f1b4e0a5164835c6c4901b61a02c8da6a3d9a4 --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/train.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +# if re-training, please also give --init_ckpt_path and --external_train_vr_res_path, may also use lower lr ? 
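+# Illustrative invocation (assumes the default data layout under data/ and
+# tvr_feature_release; exp_id is an arbitrary run name of your choice):
+#   bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \
+#     --model_type cal --exp_id my_cal_run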
+dset_name=$1 # see case below +ctx_mode=$2 # ["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d, none] , none for subtitles only models +feature_root=data/tvr_feature_release +results_root=baselines/clip_alignment_with_language/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + clip_length=1.5 + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." + desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +python baselines/clip_alignment_with_language/train.py \ +--dset_name=${dset_name} \ +--eval_split_name=${eval_split_name} \ +--nms_thd=${nms_thd} \ +--results_root=${results_root} \ +--train_path=${train_path} \ +--desc_bert_path=${desc_bert_path} \ +--corpus_path=${corpus_path} \ +--vid_feat_path=${vid_feat_path} \ +--clip_length=${clip_length} \ +--vid_feat_size=${vid_feat_size} \ +--ctx_mode=${ctx_mode} \ +${extra_args[@]} \ +${@:4} diff --git a/baselines/clip_alignment_with_language/train.py b/baselines/clip_alignment_with_language/train.py new file mode 100644 index 0000000000000000000000000000000000000000..62eff7b4d07cd19583c03d88602a0e96c7f64be9 --- /dev/null +++ b/baselines/clip_alignment_with_language/train.py @@ -0,0 +1,310 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from collections import OrderedDict +from easydict import EasyDict as EDict +from tqdm import tqdm, trange + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from baselines.clip_alignment_with_language.config import BaseOptions +from baselines.clip_alignment_with_language.model import CALWithSub +from 
baselines.clip_alignment_with_language.proposal_retrieval_dataset import \ + ProposalRetrievalDataset, proposal_retrieval_collate, ProposalRetrievalEvalDataset, prepare_batch_inputs +from baselines.clip_alignment_with_language.inference import eval_epoch, start_inference +from utils.basic_utils import save_jsonl, save_json, AverageMeter +from utils.model_utils import count_parameters + + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, train_loader, optimizer, opt, epoch_i): + model.train() + + # init meters + dataloading_time = AverageMeter() + prepare_inputs_time = AverageMeter() + model_forward_time = AverageMeter() + model_backward_time = AverageMeter() + loss_meter = AverageMeter() + + num_training_examples = len(train_loader) + timer_dataloading = time.time() + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + dataloading_time.update(time.time() - timer_dataloading) + + # continue + timer_start = time.time() + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + prepare_inputs_time.update(time.time() - timer_start) + # logger.info("model_inputs {}" + # .format({k: (type(k), v.shape if isinstance(v, torch.Tensor) else v) + # for k, v in model_inputs.items()})) + # logger.info("model_inputs \n{}".format({k: (type(v), v.shape, v.dtype) for k, v in model_inputs.items()})) + timer_start = time.time() + loss = model(**model_inputs) + model_forward_time.update(time.time() - timer_start) + timer_start = time.time() + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + model_backward_time.update(time.time() - timer_start) + + global_step = epoch_i * num_training_examples + batch_idx + opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) + opt.writer.add_scalar("Train/Loss", float(loss), global_step) + loss_meter.update(float(loss)) + + timer_dataloading = time.time() + if opt.debug and batch_idx == 3: + break + to_write = opt.train_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + loss_str=str(loss_meter.avg)) + with open(opt.train_log_filepath, "a") as f: + f.write(to_write) + print("Epoch time stats:") + print("dataloading_time: max {dataloading_time.max} " + "min {dataloading_time.min} avg {dataloading_time.avg}\n" + "prepare_inputs_time: max {prepare_inputs_time.max} " + "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" + "model_forward_time: max {model_forward_time.max} " + "min {model_forward_time.min} avg {model_forward_time.avg}\n" + "model_backward_time: max {model_backward_time.max} " + "min {model_backward_time.min} avg {model_backward_time.avg}\n" + "".format(dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, + model_forward_time=model_forward_time, model_backward_time=model_backward_time)) + + +def train(model, train_dataset, val_dataset, opt): + # Prepare optimizer + optimizer = torch.optim.SGD( + filter(lambda p: p.requires_grad, model.parameters()), + lr=opt.lr, + weight_decay=opt.wd, + momentum=opt.momentum) + # reduce the lr by 0.1 
every 30 epochs + scheduler = torch.optim.lr_scheduler.StepLR( + optimizer, + step_size=30, + gamma=0.1 + ) + + train_loader = DataLoader(train_dataset, + collate_fn=proposal_retrieval_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + prev_best_score = 0. + es_cnt = 0 + start_epoch = -1 if opt.eval_untrained else 0 + eval_tasks_at_training = ["SVMR", ] + save_submission_filename = \ + "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, "_".join(eval_tasks_at_training)) + for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): + if epoch_i > -1: + with torch.autograd.detect_anomaly(): + train_epoch(model, train_loader, optimizer, opt, epoch_i) + global_step = (epoch_i + 1) * len(train_loader) + scheduler.step() + if opt.eval_path is not None: + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, val_dataset, opt, save_submission_filename, tasks=eval_tasks_at_training, + max_before_nms=300, max_after_nms=100) + logger.info("metrics_no_nms {}".format( + pprint.pformat(rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + to_write = opt.eval_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + eval_metrics_str=json.dumps(metrics_no_nms)) + with open(opt.eval_log_filepath, "a") as f: + f.write(to_write) + + # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms + metrics = metrics_no_nms + # early stop/ log / save model + for task_type, task_metrics in metrics.items(): + for iou_thd in [0.5, 0.7]: + opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), + {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, + global_step) + + # use the most strict metric available + if metrics["SVMR"]["0.5-r1"] > prev_best_score: + es_cnt = 0 + prev_best_score = metrics["SVMR"]["0.5-r1"] + + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] + for src, tgt in zip(latest_file_paths, best_file_paths): + os.renames(src, tgt) + logger.info("The checkpoint file has been updated.") + else: + es_cnt += 1 + if es_cnt > opt.max_es_cnt: # early stop + with open(opt.train_log_filepath, "a") as f: + f.write("Early Stop at epoch {}".format(epoch_i)) + logger.info("Early stop at {} with SVMR 0.5-r1 {}".format(epoch_i, prev_best_score)) + break + else: + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + if opt.debug: + break + + opt.writer.close() + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def start_training(): + logger.info("Setup config, data and model...") + opt = BaseOptions().parse() + set_seed(opt.seed) + if opt.debug: # keep the model run deterministically + # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. + # Enable this only when input size is fixed. 
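+        # Here we therefore disable benchmark and force deterministic kernels, trading speed for
+        # run-to-run reproducibility while debugging.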
+ cudnn.benchmark = False + cudnn.deterministic = True + + opt.writer = SummaryWriter(opt.tensorboard_log_dir) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + train_dataset = ProposalRetrievalDataset( + dset_name=opt.dset_name, + model_type=opt.model_type, + data_path=opt.train_path, + desc_bert_path=opt.desc_bert_path, + sub_bert_path=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + vid_feat_path=opt.vid_feat_path, + clip_length=opt.clip_length, + vid_feat_size=opt.vid_feat_size, + sub_feat_size=opt.sub_feat_size, + ctx_mode=opt.ctx_mode, + pos_iou_thd=opt.pos_iou_thd, + neg_iou_thd=opt.neg_iou_thd, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + external_train_vr_res_path=opt.external_train_vr_res_path, # If not None, used to guide negative sampling + corpus_path=opt.corpus_path, + ) + + if opt.eval_path is not None: + eval_dataset = ProposalRetrievalEvalDataset( + dset_name=opt.dset_name, + model_type=opt.model_type, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=train_dataset.desc_bert_h5, + sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, + clip_length=opt.clip_length, + eval_proposal_bsz=opt.eval_proposal_bsz, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + else: + eval_dataset = None + + model_config = EDict( + visual_input_size=train_dataset.vid_feat_output_size, # changes based on visual input type + textual_input_size=train_dataset.sub_feat_output_size, + query_feat_size=opt.desc_feat_size, + visual_hidden_size=opt.visual_hidden_size, # + output_size=opt.output_size, + embedding_size=opt.embedding_size, + lstm_hidden_size=opt.lstm_hidden_size, + margin=opt.margin, # margin for ranking loss + loss_type=opt.loss_type, # loss type, 'hinge' or 'lse' + inter_loss_weight=opt.inter_loss_weight * (opt.ctx_mode == "tef"), # weight for inter negatives + ctx_mode=opt.ctx_mode + ) + logger.info("model_config {}".format(model_config)) + + model = CALWithSub(model_config) + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + if opt.init_ckpt_path is not None: + checkpoint = torch.load(opt.init_ckpt_path) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.init_ckpt_path)) + count_parameters(model) + + logger.info("Start Training...") + train(model, train_dataset, eval_dataset, opt) + return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug + + +if __name__ == '__main__': + model_dir, eval_split_name, eval_path, debug = start_training() + if not debug: + model_dir = model_dir.split(os.sep)[-1] + tasks = ["SVMR", "VCMR"] + input_args = ["--model_dir", model_dir, + "--eval_split_name", eval_split_name, + "--eval_path", eval_path, + "--tasks"] + tasks + + import sys + sys.argv[1:] 
= input_args + logger.info("\n\n\nFINISHED TRAINING!!!") + logger.info("Evaluating model in {}".format(model_dir)) + start_inference() diff --git a/baselines/crossmodal_moment_localization/README.md b/baselines/crossmodal_moment_localization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b874d87a02e9fe36f94990f40ffce3ce88179f02 --- /dev/null +++ b/baselines/crossmodal_moment_localization/README.md @@ -0,0 +1,2 @@ +Cross-modal Moment Localization (XML) +=== diff --git a/baselines/crossmodal_moment_localization/__init__.py b/baselines/crossmodal_moment_localization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ba4cafc7d04c6b82486c7a119c33630d2ad83a4 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..482ff866e0d76733807545a9d58b54c9daba0f93 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b27b1c7e1415b2358016a7c51a473e7907cb9acd Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..489798cd6c70dc06d0b027322c8fb85d447fdccb Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9b751ffbbf627a048418128389bae734c57a7a2 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc02db4509d07b3b24453b3100b0eae1d786dde7 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70ad01e2dc536a454283db9b0f2569ba53470534 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc differ diff --git 
a/baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5be5518699f22351ceaed33a41bf824f606ab5e1 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/config.py b/baselines/crossmodal_moment_localization/config.py new file mode 100644 index 0000000000000000000000000000000000000000..53a514d0b3837d11235d911904bf13594edd296c --- /dev/null +++ b/baselines/crossmodal_moment_localization/config.py @@ -0,0 +1,276 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile +from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--model_name", type=str) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=4, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. " + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=100, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=10, + help="number of epochs to early stop, use -1 to disable early stop") + self.parser.add_argument("--stop_task", type=str, default="VCMR", choices=["VCMR", "SVMR", "VR"], + help="Use metric associated with stop_task for early stop") + self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", + default=["VCMR"], choices=["VCMR", "SVMR", "VR"], + help="evaluate and report numbers for tasks specified here.") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=50, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_context_bsz", type=int, default=200, + help="mini-batch size at inference, for video/sub") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") + self.parser.add_argument("--lw_neg_q", type=float, default=1, + help="weight for ranking loss with negative query and positive context") + self.parser.add_argument("--lw_neg_ctx", type=float, default=1, + help="weight for ranking loss with positive query and negative context") + self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss") + self.parser.add_argument("--train_span_start_epoch", type=int, default=0, + help="which epoch to start training span prediction, -1 to disable") + self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], + help="att loss type, can be hinge loss or its smooth approximation LogSumExp") + self.parser.add_argument("--hard_negtiave_start_epoch", type=int, default=20, + help="which epoch to start hard negative sampling for video-level ranking loss," + "use -1 to disable") + self.parser.add_argument("--hard_pool_size", type=int, default=20, + help="hard negatives are still sampled, but from a harder pool.") + + # Model and Data config + self.parser.add_argument("--max_sub_l", type=int, default=50, + help="max length of all sub sentence 97.71 under 50 for 3 sentences") + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--max_ctx_l", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--val_path", type=str, default=None) + self.parser.add_argument("--test_path", type=str, default=None) + self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide evaluation. 
") + self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features") + self.parser.add_argument("--word2idx_path", type=str, + help="a dict, {word: word_idx, ...}, " + "special tokens are {: 0, : 1, : 2}") + self.parser.add_argument("--vocab_size", type=int, default=-1, + help="Set automatically to len(word2idx)") + self.parser.add_argument("--glove_path", type=str, + help="path to file containing the GloVe embeddings for words in word2idx") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--ctx_mode", type=str, choices=["video", "sub", "video_sub", "tef", + "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. a combination of [video, sub, tef]") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it only when using resnet_i3d feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + self.parser.add_argument("--clip_length", type=float, default=None, + help="each video will be uniformly segmented into small clips, " + "will automatically loaded from ProposalConfigs if None") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + + self.parser.add_argument("--span_predictor_type", type=str, default="conv", choices=["conv", "cat_linear"], + help="how to generate span predictions, " + "conv: apply 1D-Conv layer on top of NxL dot product of query and clips" + "cat_linear: cat the query and clips then use a linear layer to give output. " + "Note cat_linear is implemented as first project query and clips into scores, " + "separately, then sum them up, this should be similar to first cat then project.") + self.parser.add_argument("--stack_conv_predictor_conv_kernel_sizes", type=int, default=-1, nargs="+", + help="combine the results from conv edge detectors of all sizes specified." + "-1: disable. If specified, will ignore --conv_kernel_size option." 
+ "This flag is only used when --merge_two_stream and --span_predictor_type conv!") + self.parser.add_argument("--encoder_type", type=str, default="transformer", + choices=["gru", "lstm", "transformer", "cnn"]) + self.parser.add_argument("--add_pe_rnn", action="store_true", + help="Add positional encoding for GRU and LSTM encoder as well") + self.parser.add_argument("--no_merge_two_stream", action="store_true", help="do not merge video and subtitles") + self.parser.add_argument("--no_cross_att", action="store_true", + help="Use cross-attention for modeling video and subtitles") + self.parser.add_argument("--no_self_att", action="store_true", help="do not use self attention") + self.parser.add_argument("--no_modular", action="store_true", help="do not use modular attention") + self.parser.add_argument("--pe_type", type=str, default="cosine", choices=["none", "linear", "cosine"], + help="Only for query encoding") + self.parser.add_argument("--max_position_embeddings", type=int, default=300) + self.parser.add_argument("--hidden_size", type=int, default=256) + self.parser.add_argument("--n_heads", type=int, default=4) + self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs") + self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers") + self.parser.add_argument("--cross_att_drop", type=float, default=0.1, help="Applied to cross-att") + self.parser.add_argument("--conv_kernel_size", type=int, default=5) + self.parser.add_argument("--conv_stride", type=int, default=1) + self.parser.add_argument("--initializer_range", type=float, default=0.02, + help="initializer range for linear layer") + self.parser.add_argument("--eval_num_per_epoch", type=float) + + # post processing + self.parser.add_argument("--min_pred_l", type=int, default=2, + help="constrain the [st, ed] with ed - st >= 2" + "(2 clips with length 1.5 each, 3 secs in total" + "this is the min length for proposal-based method)") + self.parser.add_argument("--max_pred_l", type=int, default=16, + help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total" + "(16 clips with length 1.5 each, " + "this is the max length for proposal-based method)") + self.parser.add_argument("--q2c_alpha", type=float, default=20, + help="give more importance to top scored videos' spans, " + "the new score will be: s_new = exp(alpha * s), " + "higher alpha indicates more importance. Note s in [-1, 1]") + + self.parser.add_argument("--max_before_nms", type=int, default=200) + self.parser.add_argument("--max_vcmr_video", type=int, default=100, + help="re-ranking in top-max_vcmr_video") + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 
0.6 for charades_sta, 0.5 for anet_cap,") + + def display_save(self, opt): + args = vars(opt) + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(args.items())})) + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + opt.eval_query_bsz = 100 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", + "eval_split_name", "eval_path", "eval_query_bsz", "eval_context_bsz", + "max_pred_l", "min_pred_l", "external_inference_vr_res_path"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + if opt.clip_length is None: + opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] + print("Loaded clip_length {} from proposal config file".format(opt.clip_length)) + opt.results_dir = os.path.join(opt.results_root, "_".join([opt.model_name, opt.exp_id, time.strftime("%Y%m%d_%H%M%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"],) + + self.display_save(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if opt.hard_negtiave_start_epoch != -1: + if opt.hard_pool_size > opt.bsz: + print("[WARNING] hard_pool_size is larger than bsz") + + assert opt.stop_task in opt.eval_tasks_at_training + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + opt.pin_memory = not opt.no_pin_memory + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + if "tef" in opt.ctx_mode and "video" in opt.ctx_mode: + opt.vid_feat_size += 2 + if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode: + opt.sub_feat_size += 2 + + if "video" not in opt.ctx_mode or "sub" not in opt.ctx_mode: + opt.no_merge_two_stream = True + 
opt.no_cross_att = True
+
+        self.opt = opt
+        return opt
+
+
+class TestOptions(BaseOptions):
+    """add additional options for evaluating"""
+    def initialize(self):
+        BaseOptions.initialize(self)
+        # also need to specify --eval_split_name
+        self.parser.add_argument("--eval_id", type=str, help="evaluation id")
+        self.parser.add_argument("--model_dir", type=str,
+                                 help="dir containing the model file, will be converted to absolute path afterwards")
+        self.parser.add_argument("--tasks", type=str, nargs="+",
+                                 choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"],
+                                 help="Which tasks to run."
+                                      "VCMR: Video Corpus Moment Retrieval;"
+                                      "SVMR: Single Video Moment Retrieval;"
+                                      "VR: regular Video Retrieval. (will be performed automatically with VCMR)")
diff --git a/baselines/crossmodal_moment_localization/inference.py b/baselines/crossmodal_moment_localization/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..5446f664c33332a00836aad17f285b0b5d9a2b9f
--- /dev/null
+++ b/baselines/crossmodal_moment_localization/inference.py
@@ -0,0 +1,414 @@
+import os
+import copy
+import math
+import time
+import pprint
+from tqdm import tqdm, trange
+import numpy as np
+
+import torch
+import torch.nn.functional as F
+import torch.backends.cudnn as cudnn
+from torch.utils.data import DataLoader
+
+from baselines.crossmodal_moment_localization.config import TestOptions
+from baselines.crossmodal_moment_localization.model_xml import XML
+from baselines.crossmodal_moment_localization.start_end_dataset import \
+    start_end_collate, StartEndEvalDataset, prepare_batch_inputs
+from baselines.clip_alignment_with_language.inference import \
+    get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms
+from utils.basic_utils import save_json, load_json
+from utils.tensor_utils import find_max_triples_from_upper_triangle_product
+from standalone_eval.eval import eval_retrieval
+
+import logging
+# the module lives in this package (see its .pyc above); import it with the full path so that
+# running the script from the project root works
+from baselines.crossmodal_moment_localization.ndcg_iou_topk import calculate_ndcg_iou
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s",
+                    datefmt="%Y-%m-%d %H:%M:%S",
+                    level=logging.INFO)
+
+
+def compute_context_info(model, eval_dataset, opt):
+    """Use val set to do evaluation, remember to run with torch.no_grad().
+    estimated 2200 (videos) * 100 (frm) * 500 (hsz) * 4 (B) * 2 (video/sub) * 2 (layers) bytes ~ 1.76 GB
+    max_n_videos: only consider max_n_videos videos for each query to return st_ed scores.
+ """ + model.eval() + # eval_dataset.set_data_mode("context") + context_dataloader = DataLoader(eval_dataset, + collate_fn=start_end_collate, + batch_size=opt.eval_context_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + + metas = [] # list(dicts) + video_feat1 = [] + video_feat2 = [] + video_mask = [] + sub_feat1 = [] + sub_feat2 = [] + sub_mask = [] + for idx, batch in tqdm(enumerate(context_dataloader), + desc="Computing query2video scores", + total=len(context_dataloader)): + metas.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + + _video_feat1, _video_feat2, _sub_feat1, _sub_feat2 = model.encode_context( + model_inputs["video_feat"], model_inputs["video_mask"], + model_inputs["sub_feat"], model_inputs["sub_mask"]) + if "video" in opt.ctx_mode: + video_feat1.append(_video_feat1) + video_feat2.append(_video_feat2) + video_mask.append(model_inputs["video_mask"]) + if "sub" in opt.ctx_mode: + sub_feat1.append(_sub_feat1) + sub_feat2.append(_sub_feat2) + sub_mask.append(model_inputs["sub_mask"]) + + def cat_tensor(tensor_list): + if len(tensor_list) == 0: + return None + else: + seq_l = [e.shape[1] for e in tensor_list] + b_sizes = [e.shape[0] for e in tensor_list] + b_sizes_cumsum = np.cumsum([0] + b_sizes) + if len(tensor_list[0].shape) == 3: + hsz = tensor_list[0].shape[2] + res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l), hsz) + elif len(tensor_list[0].shape) == 2: + res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l)) + else: + raise ValueError("Only support 2/3 dimensional tensors") + for i, e in enumerate(tensor_list): + res_tensor[b_sizes_cumsum[i]:b_sizes_cumsum[i+1], :seq_l[i]] = e + return res_tensor + + return metas, dict( + video_feat1=cat_tensor(video_feat1), # (N_videos, L, hsz), + video_feat2=cat_tensor(video_feat2), + video_mask=cat_tensor(video_mask), # (N_videos, L) + sub_feat1=cat_tensor(sub_feat1), + sub_feat2=cat_tensor(sub_feat2), + sub_mask=cat_tensor(sub_mask), + ) + + +def index_if_not_none(input_tensor, indices): + if input_tensor is None: + return input_tensor + else: + return input_tensor[indices] + + + + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? 
The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs) # (N, L, L) + # extra_length_mask_array = np.ones(st_ed_prob_product.shape, dtype=bool) # (N, L, L) + # mask_triu = np.triu(extra_length_mask_array, k=min_pred_l) + # mask_triu_reversed = np.logical_not(np.triu(extra_length_mask_array, k=max_pred_l)) + # final_prob_mask = np.logical_and(mask_triu, mask_triu_reversed) # with valid bit to be 1 + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l) + st_ed_prob_product *= valid_prob_mask # invalid location will become zero! + + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. + _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +def load_external_vr_res2(external_vr_res_path, top_n_vr_videos=5): + """return a mapping from query_id to top retrieved video info""" + external_vr_res = load_json(external_vr_res_path) + external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"] + query2video = {e["query_id"]: e["predictions"] for e in external_vr_res} + return query2video + + +def compute_query2ctx_info(model, eval_dataset, opt, video_metas, ctx_info, + max_before_nms=1000, max_n_videos=100, maxtopk=40): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 500 (hsz) * 4 / (1024**2) = 38.15 MB + max_n_videos: int, use max_n_videos videos for computing VCMR/VR results + """ + + video2idx = eval_dataset.video2idx + # video_metas = ctx_info["video_metas"] + if opt.external_inference_vr_res_path is not None: + video_idx2meta_idx = {video2idx[m["vid_name"]]: i for i, m in enumerate(video_metas)} + external_query2video = \ + load_external_vr_res2(opt.external_inference_vr_res_path, top_n_vr_videos=max_n_videos) + # 「query idx: [video meta idx]」 + external_query2video_meta_idx = \ + {k: [video_idx2meta_idx[e[0]] for e in v] for k, v in external_query2video.items()} + else: + external_query2video = None + external_query2video_meta_idx = None + + model.eval() + eval_dataset.set_data_mode("query") + # eval_dataset.load_gt_vid_name_for_query(is_svmr) + query_eval_loader = DataLoader(eval_dataset, + collate_fn=start_end_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + n_total_videos = len(video_metas) + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + + flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int) + flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32) + sorted_q2c_indices = np.empty((n_total_query, max_n_videos), dtype=int) + sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32) + + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + _query_metas = batch[0] + query_metas.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + # query_context_scores (_N_q, N_videos), st_prob, ed_prob (_N_q, N_videos, L) + _query_context_scores, _st_probs, _ed_probs = \ + model.get_pred_from_raw_query(model_inputs["query_feat"], model_inputs["query_mask"], + ctx_info["video_feat1"], ctx_info["video_feat2"], + ctx_info["video_mask"], + ctx_info["sub_feat1"], ctx_info["sub_feat2"], + ctx_info["sub_mask"], + cross=True) + # _query_context_scores = _query_context_scores + 1 # move cosine similarity to [0, 2] + # To give more importance to top scores, the higher opt.alpha is the more importance will be given + _query_context_scores = torch.exp(opt.q2c_alpha * _query_context_scores) + + # normalize to get true probabilities!!! 
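+            # softmax is taken over the last (clip) dimension, independently for start and end, so
+            # each (query, video) row becomes a distribution over the max_ctx_l clip positions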
+ # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_st_probs, dim=-1) # (_N_q, N_videos, L) + _ed_probs = F.softmax(_ed_probs, dim=-1) + + if external_query2video is None: + _sorted_q2c_scores, _sorted_q2c_indices = \ + torch.topk(_query_context_scores, max_n_videos, dim=1, largest=True) + else: + relevant_video_info = [external_query2video[qm["query_id"]] for qm in _query_metas] + _sorted_q2c_indices = _query_context_scores.new( + [[video_idx2meta_idx[sub_e[0]] for sub_e in e] for e in relevant_video_info]).long() + _sorted_q2c_scores = _query_context_scores.new( + [[sub_e[3] for sub_e in e] for e in relevant_video_info]) + _sorted_q2c_scores = torch.exp(opt.q2c_alpha * _sorted_q2c_scores) + # collect data for vr and vcmr + sorted_q2c_indices[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_indices.cpu().numpy() + sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_scores.cpu().numpy() + + + # Get VCMR results + # compute combined scores + row_indices = torch.arange(0, len(_st_probs), device=opt.device).unsqueeze(1) + _st_probs = _st_probs[row_indices, _sorted_q2c_indices] # (_N_q, max_n_videos, L) + _ed_probs = _ed_probs[row_indices, _sorted_q2c_indices] + + # (_N_q, max_n_videos, L, L) + _st_ed_scores = torch.einsum("qvm,qv,qvn->qvmn", _st_probs, _sorted_q2c_scores, _ed_probs) + valid_prob_mask = generate_min_max_length_mask( + _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l) + _st_ed_scores *= torch.from_numpy( + valid_prob_mask).to(_st_ed_scores.device) # invalid location will become zero! + + # sort across the top-max_n_videos videos (by flatten from the 2nd dim) + # the indices here are local indices, not global indices + _n_q = _st_ed_scores.shape[0] + _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1) # (N_q, max_n_videos*L*L) + _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \ + torch.sort(_flat_st_ed_scores, dim=1, descending=True) + # collect data + flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_sorted_scores[:, :max_before_nms].cpu().numpy() + flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_scores_sorted_indices[:, :max_before_nms].cpu().numpy() + + if opt.debug: + break + + + vcmr_res = {} + for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm( + enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)), + desc="[VCMR] Loop over queries to generate predictions", total=n_total_query): # i is query_idx + # list([video_idx(int), st(float), ed(float), score(float)]) + video_meta_indices_local, pred_st_indices, pred_ed_indices = \ + np.unravel_index(_flat_st_ed_scores_sorted_indices, + shape=(max_n_videos, opt.max_ctx_l, opt.max_ctx_l)) + # video_meta_indices_local refers to the indices among the top-max_n_videos + # video_meta_indices refers to the indices in all the videos, which is the True indices + video_meta_indices = sorted_q2c_indices[i, video_meta_indices_local] + + pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length + pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length + cur_vcmr_redictions = [] + for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices, _flat_st_ed_sorted_scores)): # videos + video_idx = video2idx[video_metas[v_meta_idx]["vid_name"]] + cur_vcmr_redictions.append( + { + "video_name": video_metas[v_meta_idx]["vid_name"], + "timestamp": [float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j])], + 
"model_scores": float(v_score) + } + ) + query_id=query_metas[i]["query_id"] + vcmr_res[query_id] = cur_vcmr_redictions[:maxtopk] + return vcmr_res + + +def get_eval_res(model, eval_dataset, context_data, opt, maxtopk): + """compute and save query and video proposal embeddings""" + + video_metas, context_info = compute_context_info(model, context_data, opt) + eval_res = compute_query2ctx_info(model, eval_dataset, opt, video_metas, context_info, + max_before_nms=opt.max_before_nms, max_n_videos=opt.max_vcmr_video, maxtopk=maxtopk) + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + +# def get_prediction_top_n(list_dict_predictions, top_n): +# top_n_res = [] +# for e in list_dict_predictions: +# e["predictions"] = e["predictions"][:top_n] +# top_n_res.append(e) +# return top_n_res + + +def eval_epoch(model, eval_dataset, context_data, logger, opt, max_after_nms, iou_thds, topks): + """max_after_nms: always set to 100, since the eval script only evaluate top-100""" + # IOU_THDS = (0.3, 0.5, 0.7) + + model.eval() + pred_data = get_eval_res(model, eval_dataset, context_data, opt, max(topks)) + # pred_data = get_prediction_top_n(eval_res, top_n=max_after_nms) + gt_data = eval_dataset.ground_truth + average_ndcg = calculate_ndcg_iou(gt_data, pred_data, iou_thds, topks) + return average_ndcg, pred_data + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + loaded_model_cfg = checkpoint["model_cfg"] + loaded_model_cfg["stack_conv_predictor_conv_kernel_sizes"] = -1 + model = XML(loaded_model_cfg) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = StartEndEvalDataset( + dset_name=opt.dset_name, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat + ) + + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git 
a/baselines/crossmodal_moment_localization/model_components.py b/baselines/crossmodal_moment_localization/model_components.py new file mode 100644 index 0000000000000000000000000000000000000000..4ab6ba7d99e105c489089877a1f5ef7d630a5f41 --- /dev/null +++ b/baselines/crossmodal_moment_localization/model_components.py @@ -0,0 +1,317 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DepthwiseSeparableConv(nn.Module): + """ + Depth-wise separable convolution uses less parameters to generate output by convolution. + :Examples: + >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1) + >>> input_tensor = torch.randn(32, 300, 20) + >>> output = m(input_tensor) + """ + + def __init__(self, in_ch, out_ch, k, dim=1, relu=True): + """ + :param in_ch: input hidden dimension size + :param out_ch: output hidden dimension size + :param k: kernel size + :param dim: default 1. 1D conv or 2D conv + """ + super(DepthwiseSeparableConv, self).__init__() + self.relu = relu + if dim == 1: + self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + elif dim == 2: + self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + else: + raise Exception("Incorrect dimension!") + + def forward(self, x): + """ + :Input: (N, L_in, D) + :Output: (N, L_out, D) + """ + x = x.transpose(1, 2) + if self.relu: + out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True) + else: + out = self.pointwise_conv(self.depthwise_conv(x)) + return out.transpose(1, 2) # (N, L, D) + + +class ConvEncoder(nn.Module): + def __init__(self, kernel_size=7, n_filters=128, dropout=0.1): + super(ConvEncoder, self).__init__() + self.dropout = nn.Dropout(dropout) + self.layer_norm = nn.LayerNorm(n_filters) + self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True) + + def forward(self, x, mask): + """ + :param x: (N, L, D) + :param mask: (N, L), is not used. + :return: (N, L, D) + """ + return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D) + + +class TrainablePositionalEncoding(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + def __init__(self, max_position_embeddings, hidden_size, dropout=0.1): + super(TrainablePositionalEncoding, self).__init__() + self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_feat): + """ + Args: + input_feat: (N, L, D) + """ + bsz, seq_length = input_feat.shape[:2] + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) + position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) + + position_embeddings = self.position_embeddings(position_ids) + + embeddings = self.LayerNorm(input_feat + position_embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class PositionEncoding(nn.Module): + """ + Add positional information to input tensor. 
+ :Examples: + >>> model = PositionEncoding(n_filters=6, max_len=10) + >>> test_input1 = torch.zeros(3, 10, 6) + >>> output1 = model(test_input1) + >>> output1.size() + >>> test_input2 = torch.zeros(5, 3, 9, 6) + >>> output2 = model(test_input2) + >>> output2.size() + """ + + def __init__(self, n_filters=128, max_len=500, pe_type="cosine"): + """ + :param n_filters: same with input hidden size + :param max_len: maximum sequence length + :param pe_type: cosine or linear or None + """ + super(PositionEncoding, self).__init__() + self.pe_type = pe_type + if pe_type != "none": + position = torch.arange(0, max_len).float().unsqueeze(1) + if pe_type == "cosine": + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, n_filters) # (L, D) + div_term = torch.exp(torch.arange(0, n_filters, 2).float() * - (math.log(10000.0) / n_filters)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + elif pe_type == "linear": + pe = position / max_len + else: + raise ValueError + self.register_buffer("pe", pe) # buffer is a tensor, not a variable, (L, D) + + def forward(self, x): + """ + :Input: (*, L, D) + :Output: (*, L, D) the same size as input + """ + if self.pe_type != "none": + pe = self.pe.data[:x.size(-2), :] # (#x.size(-2), n_filters) + extra_dim = len(x.size()) - 2 + for _ in range(extra_dim): + pe = pe.unsqueeze(0) + x = x + pe + return x + + +class LinearLayer(nn.Module): + """linear layer configurable with layer normalization, dropout, ReLU.""" + + def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True): + super(LinearLayer, self).__init__() + self.relu = relu + self.layer_norm = layer_norm + if layer_norm: + self.LayerNorm = nn.LayerNorm(in_hsz) + layers = [ + nn.Dropout(dropout), + nn.Linear(in_hsz, out_hsz) + ] + self.net = nn.Sequential(*layers) + + def forward(self, x): + """(N, L, D)""" + if self.layer_norm: + x = self.LayerNorm(x) + x = self.net(x) + if self.relu: + x = F.relu(x, inplace=True) + return x # (N, L, D) + + +bert_config = dict( + hidden_size=768, + intermediate_size=768, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_attention_heads=4, +) + + +class BertLayer(nn.Module): + def __init__(self, config, use_self_attention=True): + super(BertLayer, self).__init__() + self.use_self_attention = use_self_attention + if use_self_attention: + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + """ + Args: + hidden_states: (N, L, D) + attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid + Returns: + + """ + if self.use_self_attention: + attention_output = self.attention(hidden_states, attention_mask) + else: + attention_output = hidden_states + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + """ + Args: + input_tensor: (N, L, D) + attention_mask: (N, L) + Returns: + """ + self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + 
super(BertIntermediate, self).__init__() + self.dense = nn.Sequential( + nn.Linear(config.hidden_size, config.intermediate_size), + nn.ReLU(True)) + + def forward(self, hidden_states): + return self.dense(hidden_states) + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) # (N, nh, L, dh) + + def forward(self, query_states, key_states, value_states, attention_mask): + """ + Args: + query_states: (N, Lq, D) + key_states: (N, L, D) + value_states: (N, L, D) + attention_mask: (N, Lq, L) + Returns: + """ + # only need to mask the dimension where the softmax (last dim) is applied, as another dim (second last) + # will be ignored in future computation anyway + attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L) + mixed_query_layer = self.query(query_states) + mixed_key_layer = self.key(key_states) + mixed_value_layer = self.value(value_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh) + key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh) + value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
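+        # nn.Dropout zeroes individual attention weights (and rescales the survivors by 1/(1-p))
+        # only at training time; at eval time it is a no-op.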
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states diff --git a/baselines/crossmodal_moment_localization/model_xml.py b/baselines/crossmodal_moment_localization/model_xml.py new file mode 100644 index 0000000000000000000000000000000000000000..4b0fea8ec9841755d7e27d0bab31a2409e2dc981 --- /dev/null +++ b/baselines/crossmodal_moment_localization/model_xml.py @@ -0,0 +1,642 @@ +import math +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from easydict import EasyDict as edict +from baselines.crossmodal_moment_localization.model_components import \ + BertAttention, PositionEncoding, LinearLayer, BertSelfAttention, TrainablePositionalEncoding, ConvEncoder +from utils.model_utils import RNNEncoder + +base_bert_layer_config = dict( + hidden_size=768, + intermediate_size=768, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_attention_heads=4, +) + +xml_base_config = edict( + merge_two_stream=True, # merge only the scores + cross_att=True, # cross-attention for video and subtitles + span_predictor_type="conv", + encoder_type="transformer", # cnn, transformer, lstm, gru + add_pe_rnn=False, # add positional encoding for RNNs, (LSTM and GRU) + visual_input_size=2048, # changes based on visual input type + query_input_size=768, + sub_input_size=768, + hidden_size=500, # + conv_kernel_size=5, # conv kernel_size for st_ed predictor + stack_conv_predictor_conv_kernel_sizes=-1, # Do not use + conv_stride=1, # + max_ctx_l=100, + max_desc_l=30, + input_drop=0.1, # dropout for input + drop=0.1, # dropout for other layers + n_heads=4, # self attention heads + ctx_mode="video_sub", # which context are used. 'video', 'sub' or 'video_sub' + margin=0.1, # margin for ranking loss + ranking_loss_type="hinge", # loss type, 'hinge' or 'lse' + lw_neg_q=1, # loss weight for neg. query and pos. context + lw_neg_ctx=1, # loss weight for pos. query and neg. context + lw_st_ed=1, # loss weight for st ed prediction + use_hard_negative=False, # use hard negative at video level, we may change it during training. 
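+    # (added note) when use_hard_negative is switched on during training, negatives are
+    # sampled only from the `hard_pool_size` highest-scoring non-positive videos in the
+    # batch; see XML.get_neg_scores below.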
+ hard_pool_size=20, + use_self_attention=True, + no_modular=False, + pe_type="none", # no positional encoding + initializer_range=0.02, +) + + +class XML(nn.Module): + def __init__(self, config): + super(XML, self).__init__() + self.config = config + # self.position_embeddings = PositionEncoding(n_filters=config.hidden_size, + # max_len=config.max_position_embeddings, + # pe_type=config.pe_type) + self.query_pos_embed = TrainablePositionalEncoding( + max_position_embeddings=config.max_desc_l, + hidden_size=config.hidden_size, dropout=config.input_drop) + self.ctx_pos_embed = TrainablePositionalEncoding( + max_position_embeddings=config.max_ctx_l, + hidden_size=config.hidden_size, dropout=config.input_drop) + self.query_input_proj = LinearLayer(config.query_input_size, + config.hidden_size, + layer_norm=True, + dropout=config.input_drop, + relu=True) + if config.encoder_type == "transformer": # self-att encoder + self.query_encoder = BertAttention(edict( + hidden_size=config.hidden_size, + intermediate_size=config.hidden_size, + hidden_dropout_prob=config.drop, + attention_probs_dropout_prob=config.drop, + num_attention_heads=config.n_heads, + )) + elif config.encoder_type == "cnn": + self.query_encoder = ConvEncoder( + kernel_size=5, + n_filters=config.hidden_size, + dropout=config.drop + ) + elif config.encoder_type in ["gru", "lstm"]: + self.query_encoder = RNNEncoder( + word_embedding_size=config.hidden_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type=config.encoder_type, + return_outputs=True, + return_hidden=False + ) + + conv_cfg = dict(in_channels=1, + out_channels=1, + kernel_size=config.conv_kernel_size, + stride=config.conv_stride, + padding=config.conv_kernel_size // 2, + bias=False) + + cross_att_cfg = edict( + hidden_size=config.hidden_size, + num_attention_heads=config.n_heads, + attention_probs_dropout_prob=config.drop + ) + + self.use_video = "video" in config.ctx_mode + if self.use_video: + self.video_input_proj = LinearLayer(config.visual_input_size, + config.hidden_size, + layer_norm=True, + dropout=config.input_drop, + relu=True) + self.video_encoder1 = copy.deepcopy(self.query_encoder) + self.video_encoder2 = copy.deepcopy(self.query_encoder) + if self.config.cross_att: + self.video_cross_att = BertSelfAttention(cross_att_cfg) + self.video_cross_layernorm = nn.LayerNorm(config.hidden_size) + else: + if self.config.encoder_type == "transformer": + self.video_encoder3 = copy.deepcopy(self.query_encoder) + self.video_query_linear = nn.Linear(config.hidden_size, config.hidden_size) + if config.span_predictor_type == "conv": + if not config.merge_two_stream: + self.video_st_predictor = nn.Conv1d(**conv_cfg) + self.video_ed_predictor = nn.Conv1d(**conv_cfg) + elif config.span_predictor_type == "cat_linear": + self.video_st_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + self.video_ed_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + + self.use_sub = "sub" in config.ctx_mode + if self.use_sub: + self.sub_input_proj = LinearLayer(config.sub_input_size, + config.hidden_size, + layer_norm=True, + dropout=config.input_drop, + relu=True) + self.sub_encoder1 = copy.deepcopy(self.query_encoder) + self.sub_encoder2 = copy.deepcopy(self.query_encoder) + if self.config.cross_att: + self.sub_cross_att = BertSelfAttention(cross_att_cfg) + self.sub_cross_layernorm = nn.LayerNorm(config.hidden_size) + else: + if self.config.encoder_type == "transformer": + self.sub_encoder3 = 
copy.deepcopy(self.query_encoder) + self.sub_query_linear = nn.Linear(config.hidden_size, config.hidden_size) + if config.span_predictor_type == "conv": + if not config.merge_two_stream: + self.sub_st_predictor = nn.Conv1d(**conv_cfg) + self.sub_ed_predictor = nn.Conv1d(**conv_cfg) + elif config.span_predictor_type == "cat_linear": + self.sub_st_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + self.sub_ed_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + + self.modular_vector_mapping = nn.Linear(in_features=config.hidden_size, + out_features=self.use_sub + self.use_video, + bias=False) + + self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") + + if config.merge_two_stream and config.span_predictor_type == "conv": + if self.config.stack_conv_predictor_conv_kernel_sizes == -1: + self.merged_st_predictor = nn.Conv1d(**conv_cfg) + self.merged_ed_predictor = nn.Conv1d(**conv_cfg) + else: + print("Will be using multiple Conv layers for prediction.") + self.merged_st_predictors = nn.ModuleList() + self.merged_ed_predictors = nn.ModuleList() + num_convs = len(self.config.stack_conv_predictor_conv_kernel_sizes) + for k in self.config.stack_conv_predictor_conv_kernel_sizes: + conv_cfg = dict(in_channels=1, + out_channels=1, + kernel_size=k, + stride=config.conv_stride, + padding=k // 2, + bias=False) + self.merged_st_predictors.append(nn.Conv1d(**conv_cfg)) + self.merged_ed_predictors.append(nn.Conv1d(**conv_cfg)) + self.combine_st_conv = nn.Linear(num_convs, 1, bias=False) + self.combine_ed_conv = nn.Linear(num_convs, 1, bias=False) + + self.reset_parameters() + + def reset_parameters(self): + """ Initialize the weights.""" + + def re_init(module): + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + module.reset_parameters() + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + self.apply(re_init) + + def set_hard_negative(self, use_hard_negative, hard_pool_size): + """use_hard_negative: bool; hard_pool_size: int, """ + self.config.use_hard_negative = use_hard_negative + self.config.hard_pool_size = hard_pool_size + + def set_train_st_ed(self, lw_st_ed): + """pre-train video retrieval then span prediction""" + self.config.lw_st_ed = lw_st_ed + + def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, + tef_feat, tef_mask, st_ed_indices): + """ + Args: + query_feat: (N, Lq, Dq) + query_mask: (N, Lq) + video_feat: (N, Lv, Dv) or None + video_mask: (N, Lv) or None + sub_feat: (N, Lv, Ds) or None + sub_mask: (N, Lv) or None + tef_feat: (N, Lv, 2) or None, + tef_mask: (N, Lv) or None, + st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively. 
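+            Returns:
+                loss: scalar tensor, weighted sum of the span and ranking losses
+                loss_dict: dict of floats with keys "loss_st_ed", "loss_neg_ctx",
+                    "loss_neg_q", "loss_overall"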
+ """ + video_feat1, video_feat2, sub_feat1, sub_feat2 = \ + self.encode_context(video_feat, video_mask, sub_feat, sub_mask) + + query_context_scores, st_prob, ed_prob = \ + self.get_pred_from_raw_query(query_feat, query_mask, + video_feat1, video_feat2, video_mask, + sub_feat1, sub_feat2, sub_mask, cross=False) + + loss_st_ed = 0 + if self.config.lw_st_ed != 0: + loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0]) + loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1]) + loss_st_ed = loss_st + loss_ed + + loss_neg_ctx, loss_neg_q = 0, 0 + if self.config.lw_neg_ctx != 0 or self.config.lw_neg_q != 0: + loss_neg_ctx, loss_neg_q = self.get_video_level_loss(query_context_scores) + + loss_st_ed = self.config.lw_st_ed * loss_st_ed + loss_neg_ctx = self.config.lw_neg_ctx * loss_neg_ctx + loss_neg_q = self.config.lw_neg_q * loss_neg_q + loss = loss_st_ed + loss_neg_ctx + loss_neg_q + return loss, {"loss_st_ed": float(loss_st_ed), + "loss_neg_ctx": float(loss_neg_ctx), + "loss_neg_q": float(loss_neg_q), + "loss_overall": float(loss)} + + def get_visualization_data(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, + tef_feat, tef_mask, st_ed_indices): + assert self.config.merge_two_stream and self.use_video and self.use_sub and not self.config.no_modular + video_feat1, video_feat2, sub_feat1, sub_feat2 = \ + self.encode_context(video_feat, video_mask, sub_feat, sub_mask) + encoded_query = self.encode_input(query_feat, query_mask, + self.query_input_proj, self.query_encoder, self.query_pos_embed) # (N, Lq, D) + # (N, D), (N, D), (N, L, 2) + video_query, sub_query, modular_att_scores = \ + self.get_modularized_queries(encoded_query, query_mask, return_modular_att=True) + # (N, L), (N, L), (N, L) + st_prob, ed_prob, similarity_scores, video_similarity, sub_similarity = self.get_merged_st_ed_prob( + video_query, video_feat2, sub_query, sub_feat2, video_mask, cross=False, return_similaity=True) + + # clean up invalid bits + data = dict(modular_att_scores=modular_att_scores.cpu().numpy(), # (N, Lq, 2), row 0, 1 are video, sub. 
+ st_prob=st_prob.cpu().numpy(), # (N, L) + ed_prob=ed_prob.cpu().numpy(), # (N, L) + similarity_scores=similarity_scores.cpu().numpy(), # (N, L) + video_similarity=video_similarity.cpu().numpy(), # (N, L) + sub_similarity=sub_similarity.cpu().numpy(), # (N, L) + st_ed_indices=st_ed_indices.cpu().numpy()) # (N, L) + query_lengths = query_mask.sum(1).to(torch.long).cpu().tolist() # (N, ) + ctx_lengths = video_mask.sum(1).to(torch.long).cpu().tolist() # (N, ) + # print("query_lengths {}".format((type(query_lengths), len(query_lengths), query_lengths[:10]))) + for k, v in data.items(): + if k == "modular_att_scores": + # print(k, v, v.shape, type(v)) + data[k] = [e[:l] for l, e in zip(query_lengths, v)] # list(e) where e is (Lq_i, 2) + else: + data[k] = [e[:l] for l, e in zip(ctx_lengths, v)] # list(e) where e is (Lc_i) + + # aggregate info for each example + datalist = [] + for idx in range(len(data["modular_att_scores"])): + datalist.append({k: v[idx] for k, v in data.items()}) + return datalist # list(dicts) of length N + + def encode_query(self, query_feat, query_mask): + encoded_query = self.encode_input(query_feat, query_mask, + self.query_input_proj, self.query_encoder, self.query_pos_embed) # (N, Lq, D) + video_query, sub_query = self.get_modularized_queries(encoded_query, query_mask) # (N, D) * 2 + return video_query, sub_query + + def non_cross_encode_context(self, context_feat, context_mask, module_name="video"): + encoder_layer3 = getattr(self, module_name + "_encoder3") \ + if self.config.encoder_type == "transformer" else None + return self._non_cross_encode_context(context_feat, context_mask, + input_proj_layer=getattr(self, module_name + "_input_proj"), + encoder_layer1=getattr(self, module_name + "_encoder1"), + encoder_layer2=getattr(self, module_name + "_encoder2"), + encoder_layer3=encoder_layer3) + + def _non_cross_encode_context(self, context_feat, context_mask, input_proj_layer, + encoder_layer1, encoder_layer2, encoder_layer3=None): + """ + Args: + context_feat: (N, L, D) + context_mask: (N, L) + input_proj_layer: + encoder_layer1: + encoder_layer2: + encoder_layer3 + """ + context_feat1 = self.encode_input( + context_feat, context_mask, input_proj_layer, encoder_layer1, self.ctx_pos_embed) # (N, L, D) + if self.config.encoder_type in ["transformer", "cnn"]: + context_mask = context_mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor + context_feat2 = encoder_layer2(context_feat1, context_mask) # (N, L, D) + if self.config.encoder_type == "transformer": + context_feat2 = encoder_layer3(context_feat2, context_mask) + elif self.config.encoder_type in ["gru", "lstm"]: + context_mask = context_mask.sum(1).long() # (N, ), torch.LongTensor + context_feat2 = encoder_layer2(context_feat1, context_mask)[0] # (N, L, D) + else: + raise NotImplementedError + return context_feat1, context_feat2 + + def encode_context(self, video_feat, video_mask, sub_feat, sub_mask): + if self.config.cross_att: + assert self.use_video and self.use_sub + + return self.cross_encode_context(video_feat, video_mask, sub_feat, sub_mask) + else: + video_feat1, video_feat2 = (None,) * 2 + if self.use_video: + video_feat1, video_feat2 = self.non_cross_encode_context(video_feat, video_mask, module_name="video") + sub_feat1, sub_feat2 = (None,) * 2 + if self.use_sub: + sub_feat1, sub_feat2 = self.non_cross_encode_context(sub_feat, sub_mask, module_name="sub") + return video_feat1, video_feat2, sub_feat1, sub_feat2 + + def cross_encode_context(self, video_feat, video_mask, sub_feat, sub_mask): + 
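+        # (added note) each stream is first projected and encoded on its own, then attends
+        # over the other stream (video <-> sub cross-attention) with a residual + LayerNorm
+        # and a second encoder; both the pre- and post-cross features are returned,
+        # mirroring the (feat1, feat2) pairs of the non-cross path.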
encoded_video_feat = self.encode_input(video_feat, video_mask, + self.video_input_proj, self.video_encoder1, self.ctx_pos_embed) + encoded_sub_feat = self.encode_input(sub_feat, sub_mask, + self.sub_input_proj, self.sub_encoder1, self.ctx_pos_embed) + x_encoded_video_feat = self.cross_context_encoder( + encoded_video_feat, video_mask, encoded_sub_feat, sub_mask, + self.video_cross_att, self.video_cross_layernorm, self.video_encoder2) # (N, L, D) + x_encoded_sub_feat = self.cross_context_encoder( + encoded_sub_feat, sub_mask, encoded_video_feat, video_mask, + self.sub_cross_att, self.sub_cross_layernorm, self.sub_encoder2) # (N, L, D) + return encoded_video_feat, x_encoded_video_feat, encoded_sub_feat, x_encoded_sub_feat + + def cross_context_encoder(self, main_context_feat, main_context_mask, side_context_feat, side_context_mask, + cross_att_layer, norm_layer, self_att_layer): + """ + Args: + main_context_feat: (N, Lq, D) + main_context_mask: (N, Lq) + side_context_feat: (N, Lk, D) + side_context_mask: (N, Lk) + cross_att_layer: + norm_layer: + self_att_layer: + """ + cross_mask = torch.einsum("bm,bn->bmn", main_context_mask, side_context_mask) # (N, Lq, Lk) + cross_out = cross_att_layer(main_context_feat, side_context_feat, side_context_feat, cross_mask) # (N, Lq, D) + residual_out = norm_layer(cross_out + main_context_feat) + if self.config.encoder_type in ["cnn", "transformer"]: + return self_att_layer(residual_out, main_context_mask.unsqueeze(1)) + elif self.config.encoder_type in ["gru", "lstm"]: + return self_att_layer(residual_out, main_context_mask.sum(1).long())[0] + + def encode_input(self, feat, mask, input_proj_layer, encoder_layer, pos_embed_layer): + """ + Args: + feat: (N, L, D_input), torch.float32 + mask: (N, L), torch.float32, with 1 indicates valid query, 0 indicates mask + input_proj_layer: down project input + encoder_layer: encoder layer + # add_pe: bool, whether to add positional encoding + pos_embed_layer + """ + feat = input_proj_layer(feat) + + if self.config.encoder_type in ["cnn", "transformer"]: + feat = pos_embed_layer(feat) + mask = mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor + return encoder_layer(feat, mask) # (N, L, D_hidden) + elif self.config.encoder_type in ["gru", "lstm"]: + if self.config.add_pe_rnn: + feat = pos_embed_layer(feat) + mask = mask.sum(1).long() # (N, ), torch.LongTensor + return encoder_layer(feat, mask)[0] # (N, L, D_hidden) + + def get_modularized_queries(self, encoded_query, query_mask, return_modular_att=False): + """ + Args: + encoded_query: (N, L, D) + query_mask: (N, L) + return_modular_att: bool + """ + if self.config.no_modular: + modular_query = torch.max(mask_logits(encoded_query, query_mask.unsqueeze(2)), dim=1)[0] # (N, D) + return modular_query, modular_query # + else: + modular_attention_scores = self.modular_vector_mapping(encoded_query) # (N, L, 2 or 1) + modular_attention_scores = F.softmax( + mask_logits(modular_attention_scores, query_mask.unsqueeze(2)), dim=1) + # TODO check whether it is the same + modular_queries = torch.einsum("blm,bld->bmd", + modular_attention_scores, encoded_query) # (N, 2 or 1, D) + if return_modular_att: + assert modular_queries.shape[1] == 2 + return modular_queries[:, 0], modular_queries[:, 1], modular_attention_scores + else: + if modular_queries.shape[1] == 2: + return modular_queries[:, 0], modular_queries[:, 1] # (N, D) * 2 + else: # 1 + return modular_queries[:, 0], modular_queries[:, 0] # the same + + def get_modular_weights(self, encoded_query, query_mask): + """ + Args: + 
encoded_query: (N, L, D) + query_mask: (N, L) + """ + max_encoded_query, _ = torch.max(mask_logits(encoded_query, query_mask.unsqueeze(2)), dim=1) # (N, D) + modular_weights = self.modular_weights_calculator(max_encoded_query) # (N, 2) + modular_weights = F.softmax(modular_weights, dim=-1) + return modular_weights[:, 0:1], modular_weights[:, 1:2] # (N, 1) * 2 + + def get_video_level_scores(self, modularied_query, context_feat1, context_mask): + """ Calculate video2query scores for each pair of video and query inside the batch. + Args: + modularied_query: (N, D) + context_feat1: (N, L, D), output of the first transformer encoder layer + context_mask: (N, L) + Returns: + context_query_scores: (N, N) score of each query w.r.t. each video inside the batch, + diagonal positions are positive. used to get negative samples. + """ + modularied_query = F.normalize(modularied_query, dim=-1) + context_feat1 = F.normalize(context_feat1, dim=-1) + query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat1) # (N, L, N) + context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N) + query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N) + query_context_scores, _ = torch.max(query_context_scores, + dim=1) # (N, N) diagonal positions are positive pairs. + return query_context_scores + + def get_merged_st_ed_prob(self, video_query, video_feat, sub_query, sub_feat, context_mask, + cross=False, return_similaity=False): + """context_mask could be either video_mask or sub_mask, since they are the same""" + assert self.use_video and self.use_sub and self.config.span_predictor_type == "conv" + video_query = self.video_query_linear(video_query) + sub_query = self.sub_query_linear(sub_query) + stack_conv = self.config.stack_conv_predictor_conv_kernel_sizes != -1 + num_convs = len(self.config.stack_conv_predictor_conv_kernel_sizes) if stack_conv else None + if cross: + video_similarity = torch.einsum("md,nld->mnl", video_query, video_feat) + sub_similarity = torch.einsum("md,nld->mnl", sub_query, sub_feat) + similarity = (video_similarity + sub_similarity) / 2 # (Nq, Nv, L) from query to all videos. 
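+            # (added note) score fusion: the two modality similarities are simply averaged
+            # into a single (Nq, Nv, L) map, and one shared Conv1d start/end predictor is
+            # applied on top of it below, rather than keeping per-modality predictors.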
+ n_q, n_c, l = similarity.shape + similarity = similarity.view(n_q * n_c, 1, l) + if not stack_conv: + st_prob = self.merged_st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + ed_prob = self.merged_ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + else: + st_prob_list = [] + ed_prob_list = [] + for idx in range(num_convs): + st_prob_list.append(self.merged_st_predictors[idx](similarity).squeeze().unsqueeze(2)) + ed_prob_list.append(self.merged_ed_predictors[idx](similarity).squeeze().unsqueeze(2)) + # (Nq*Nv, L, 3) --> (Nq*Nv, L) -> (Nq, Nv, L) + st_prob = self.combine_st_conv(torch.cat(st_prob_list, dim=2)).view(n_q, n_c, l) + ed_prob = self.combine_ed_conv(torch.cat(ed_prob_list, dim=2)).view(n_q, n_c, l) + else: + video_similarity = torch.einsum("bd,bld->bl", video_query, video_feat) # (N, L) + sub_similarity = torch.einsum("bd,bld->bl", sub_query, sub_feat) # (N, L) + similarity = (video_similarity + sub_similarity) / 2 + if not stack_conv: + st_prob = self.merged_st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + ed_prob = self.merged_ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + else: + st_prob_list = [] + ed_prob_list = [] + for idx in range(num_convs): + st_prob_list.append(self.merged_st_predictors[idx](similarity.unsqueeze(1)).squeeze().unsqueeze(2)) + ed_prob_list.append(self.merged_ed_predictors[idx](similarity.unsqueeze(1)).squeeze().unsqueeze(2)) + st_prob = self.combine_st_conv(torch.cat(st_prob_list, dim=2)).squeeze() # (N, L, 3) --> (N, L) + ed_prob = self.combine_ed_conv(torch.cat(ed_prob_list, dim=2)).squeeze() # (N, L, 3) --> (N, L) + st_prob = mask_logits(st_prob, context_mask) # (N, L) + ed_prob = mask_logits(ed_prob, context_mask) + if return_similaity: + assert not cross + return st_prob, ed_prob, similarity, video_similarity, sub_similarity + else: + return st_prob, ed_prob + + def get_st_ed_prob(self, modularied_query, context_feat2, context_mask, + module_name="video", cross=False): + return self._get_st_ed_prob(modularied_query, context_feat2, context_mask, + module_query_linear=getattr(self, module_name + "_query_linear"), + st_predictor=getattr(self, module_name + "_st_predictor"), + ed_predictor=getattr(self, module_name + "_ed_predictor"), + cross=cross) + + def _get_st_ed_prob(self, modularied_query, context_feat2, context_mask, + module_query_linear, st_predictor, ed_predictor, cross=False): + """ + Args: + modularied_query: (N, D) + context_feat2: (N, L, D), output of the first transformer encoder layer + context_mask: (N, L) + module_query_linear: + st_predictor: + ed_predictor: + cross: at inference, calculate prob for each possible pairs of query and context. + """ + query = module_query_linear(modularied_query) # (N, D) no need to normalize here. + if cross: + if self.config.span_predictor_type == "conv": + similarity = torch.einsum("md,nld->mnl", query, context_feat2) # (Nq, Nv, L) from query to all videos. 
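+                # (added note) every query-video pair is treated as an independent
+                # 1-channel sequence: (Nq, Nv, L) is reshaped to (Nq*Nv, 1, L) so the same
+                # Conv1d yields start/end logits for all pairs at once.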
+ n_q, n_c, l = similarity.shape + similarity = similarity.view(n_q * n_c, 1, l) + st_prob = st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + ed_prob = ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + elif self.config.span_predictor_type == "cat_linear": + st_prob_q = st_predictor[0](query).unsqueeze(1) # (Nq, 1, 1) + st_prob_ctx = st_predictor[1](context_feat2).squeeze().unsqueeze(0) # (1, Nv, L) + st_prob = st_prob_q + st_prob_ctx # (Nq, Nv, L) + ed_prob_q = ed_predictor[0](query).unsqueeze(1) # (Nq, 1, 1) + ed_prob_ctx = ed_predictor[1](context_feat2).squeeze().unsqueeze(0) # (1, Nv, L) + ed_prob = ed_prob_q + ed_prob_ctx # (Nq, Nv, L) + context_mask = context_mask.unsqueeze(0) # (1, Nv, L) + else: + if self.config.span_predictor_type == "conv": + similarity = torch.einsum("bd,bld->bl", query, context_feat2) # (N, L) + st_prob = st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + ed_prob = ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + elif self.config.span_predictor_type == "cat_linear": + # avoid concatenation by break into smaller matrix multiplications. + st_prob = st_predictor[0](query) + st_predictor[1](context_feat2).squeeze() # (N, L) + ed_prob = ed_predictor[0](query) + ed_predictor[1](context_feat2).squeeze() # (N, L) + st_prob = mask_logits(st_prob, context_mask) # (N, L) + ed_prob = mask_logits(ed_prob, context_mask) + return st_prob, ed_prob + + def get_pred_from_raw_query(self, query_feat, query_mask, + video_feat1, video_feat2, video_mask, + sub_feat1, sub_feat2, sub_mask, cross=False): + """ + Args: + query_feat: (N, Lq, Dq) + query_mask: (N, Lq) + video_feat1: (N, Lv, D) or None + video_feat2: + video_mask: (N, Lv) + sub_feat1: (N, Lv, D) or None + sub_feat2: + sub_mask: (N, Lv) + cross: + """ + video_query, sub_query = self.encode_query(query_feat, query_mask) + divisor = self.use_sub + self.use_video + + # get video-level retrieval scores + video_q2ctx_scores = self.get_video_level_scores(video_query, video_feat1, video_mask) if self.use_video else 0 + sub_q2ctx_scores = self.get_video_level_scores(sub_query, sub_feat1, sub_mask) if self.use_sub else 0 + q2ctx_scores = (video_q2ctx_scores + sub_q2ctx_scores) / divisor # (N, N) + + if self.config.merge_two_stream and self.use_video and self.use_sub: + st_prob, ed_prob = self.get_merged_st_ed_prob( + video_query, video_feat2, sub_query, sub_feat2, video_mask, cross=cross) + else: + video_st_prob, video_ed_prob = self.get_st_ed_prob( + video_query, video_feat2, video_mask, module_name="video", cross=cross) if self.use_video else (0, 0) + sub_st_prob, sub_ed_prob = self.get_st_ed_prob( + sub_query, sub_feat2, sub_mask, module_name="sub", cross=cross) if self.use_sub else (0, 0) + st_prob = (video_st_prob + sub_st_prob) / divisor # (N, Lv) + ed_prob = (video_ed_prob + sub_ed_prob) / divisor # (N, Lv) + return q2ctx_scores, st_prob, ed_prob # un-normalized masked probabilities!!!!! + + def get_video_level_loss(self, query_context_scores): + """ ranking loss between (pos. query + pos. video) and (pos. query + neg. video) or (neg. query + pos. video) + Args: + query_context_scores: (N, N), cosine similarity [-1, 1], + Each row contains the scores between the query to each of the videos inside the batch. 
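+                For example, with a batch of size 3 the entries (0,0), (1,1), (2,2) are the
+                positive pairs; for each of them one negative video (same row) and one
+                negative query (same column) are sampled to form the two ranking losses.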
+ """ + bsz = len(query_context_scores) + diagonal_indices = torch.arange(bsz).to(query_context_scores.device) + pos_scores = query_context_scores[diagonal_indices, diagonal_indices] # (N, ) + query_context_scores_masked = copy.deepcopy(query_context_scores.data) + # impossibly large for cosine similarity, the copy is created as modifying the original will cause error + query_context_scores_masked[diagonal_indices, diagonal_indices] = 999 + pos_query_neg_context_scores = self.get_neg_scores(query_context_scores, + query_context_scores_masked) + neg_query_pos_context_scores = self.get_neg_scores(query_context_scores.transpose(0, 1), + query_context_scores_masked.transpose(0, 1)) + loss_neg_ctx = self.get_ranking_loss(pos_scores, pos_query_neg_context_scores) + loss_neg_q = self.get_ranking_loss(pos_scores, neg_query_pos_context_scores) + return loss_neg_ctx, loss_neg_q + + def get_neg_scores(self, scores, scores_masked): + """ + scores: (N, N), cosine similarity [-1, 1], + Each row are scores: query --> all videos. Transposed version: video --> all queries. + scores_masked: (N, N) the same as scores, except that the diagonal (positive) positions + are masked with a large value. + """ + bsz = len(scores) + batch_indices = torch.arange(bsz).to(scores.device) + _, sorted_scores_indices = torch.sort(scores_masked, descending=True, dim=1) + sample_min_idx = 1 # skip the masked positive + sample_max_idx = min(sample_min_idx + self.config.hard_pool_size, bsz) \ + if self.config.use_hard_negative else bsz + sampled_neg_score_indices = sorted_scores_indices[ + batch_indices, torch.randint(sample_min_idx, sample_max_idx, size=(bsz,)).to(scores.device)] # (N, ) + sampled_neg_scores = scores[batch_indices, sampled_neg_score_indices] # (N, ) + return sampled_neg_scores + + def get_ranking_loss(self, pos_score, neg_score): + """ Note here we encourage positive scores to be larger than negative scores. 
+ Args: + pos_score: (N, ), torch.float32 + neg_score: (N, ), torch.float32 + """ + if self.config.ranking_loss_type == "hinge": # max(0, m + S_neg - S_pos) + return torch.clamp(self.config.margin + neg_score - pos_score, min=0).sum() / len(pos_score) + elif self.config.ranking_loss_type == "lse": # log[1 + exp(S_neg - S_pos)] + return torch.log1p(torch.exp(neg_score - pos_score)).sum() / len(pos_score) + else: + raise NotImplementedError("Only support 'hinge' and 'lse'") + + +def mask_logits(target, mask): + return target * mask + (1 - mask) * (-1e10) diff --git a/baselines/crossmodal_moment_localization/ndcg_iou_topk.py b/baselines/crossmodal_moment_localization/ndcg_iou_topk.py new file mode 100644 index 0000000000000000000000000000000000000000..dd61093f88290c30c1f7794a16c13477f8296729 --- /dev/null +++ b/baselines/crossmodal_moment_localization/ndcg_iou_topk.py @@ -0,0 +1,68 @@ +from utils.basic_utils import load_jsonl, save_jsonl, load_json +import pandas as pd +from tqdm import tqdm +import numpy as np +from collections import defaultdict +import copy + +def calculate_iou(pred_start: float, pred_end: float, gt_start: float, gt_end: float) -> float: + intersection_start = max(pred_start, gt_start) + intersection_end = min(pred_end, gt_end) + intersection = max(0, intersection_end - intersection_start) + union = (pred_end - pred_start) + (gt_end - gt_start) - intersection + return intersection / union if union > 0 else 0 + + +# Function to calculate DCG +def calculate_dcg(scores): + return sum((2**score - 1) / np.log2(idx + 2) for idx, score in enumerate(scores)) + +# Function to calculate NDCG +def calculate_ndcg(pred_scores, true_scores): + dcg = calculate_dcg(pred_scores) + idcg = calculate_dcg(sorted(true_scores, reverse=True)) + return dcg / idcg if idcg > 0 else 0 + + + +def calculate_ndcg_iou(all_gt, all_pred, TS, KS): + performance = defaultdict(lambda: defaultdict(list)) + performance_avg = defaultdict(lambda: defaultdict(float)) + for k in tqdm(all_pred.keys(), desc="Calculate NDCG"): + one_pred = all_pred[k] + one_gt = all_gt[k] + + one_gt.sort(key=lambda x: x["relevance"], reverse=True) + for T in TS: + one_gt_drop = copy.deepcopy(one_gt) + predictions_with_scores = [] + + for pred in one_pred: + pred_video_name, pred_time = pred["video_name"], pred["timestamp"] + matched_rows = [gt for gt in one_gt_drop if gt["video_name"] == pred_video_name] + if not matched_rows: + pred["pred_relevance"] = 0 + else: + ious = [calculate_iou(pred_time[0], pred_time[1], gt["timestamp"][0], gt["timestamp"][1]) for gt in matched_rows] + max_iou_idx = np.argmax(ious) + max_iou_row = matched_rows[max_iou_idx] + + if ious[max_iou_idx] > T: + pred["pred_relevance"] = max_iou_row["relevance"] + # Remove the matched ground truth row + original_idx = one_gt_drop.index(max_iou_row) + one_gt_drop.pop(original_idx) + else: + pred["pred_relevance"] = 0 + predictions_with_scores.append(pred) + for K in KS: + true_scores = [gt["relevance"] for gt in one_gt][:K] + pred_scores = [pred["pred_relevance"] for pred in predictions_with_scores][:K] + ndcg_score = calculate_ndcg(pred_scores, true_scores) + performance[K][T].append(ndcg_score) + for K, vs in performance.items(): + for T, v in vs.items(): + performance_avg[K][T] = np.mean(v) + return performance_avg + + diff --git a/baselines/crossmodal_moment_localization/optimization.py b/baselines/crossmodal_moment_localization/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..985765697f995e0d7821c1b945041b418bbec853 --- 
/dev/null +++ b/baselines/crossmodal_moment_localization/optimization.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +import abc +import sys + +logger = logging.getLogger(__name__) + + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class _LRSchedule(ABC): + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense + def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ + super(_LRSchedule, self).__init__(**kw) + if t_total < 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) + self.warned_for_t_total_at_progress = -1 + + def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ + if self.t_total < 0: + return 1. + progress = float(step) / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + @abc.abstractmethod + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ + return 1. + + +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 
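+    For instance, with warmup=0.1 and the default cycles=0.5, the multiplier rises linearly
+    from 0 to 1 over the first 10% of the steps and then decays as 0.5 * (1 + cos(pi * p)),
+    where p is the fraction of the remaining steps already taken.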
+ """ + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. + :param kw: + """ + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) + return ret + + +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * progress)) + return ret + + +class WarmupConstantSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + """ + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + """ + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0.) + + +SCHEDULES = { + None: ConstantLR, + "none": ConstantLR, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule +} + + +class EMA(object): + """ Exponential Moving Average for model parameters. 
+ references: + [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py + [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py""" + def __init__(self, decay): + self.decay = decay + self.shadow = {} + self.original = {} + + def register(self, name, val): + self.shadow[name] = val.clone() + + def __call__(self, model, step): + decay = min(self.decay, (1 + step) / (10.0 + step)) + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + new_average = \ + (1.0 - decay) * param.data + decay * self.shadow[name] + self.shadow[name] = new_average.clone() + + def assign(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + self.original[name] = param.data.clone() + param.data = self.shadow[name] + + def resume(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + param.data = self.original[name] + + +class BertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 + schedule: schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + """ + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
" + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + super(BertAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(grad, alpha=1 - beta1) + next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
+ if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + # No bias correction + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + + return loss diff --git a/baselines/crossmodal_moment_localization/scripts/eval.sh b/baselines/crossmodal_moment_localization/scripts/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..e75c03d2de065a0099d704482c77af481e127e8c --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/eval.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/eval.sh ANY_OTHER_PYTHON_ARGS +eval_split_name=$1 +submission_path=$2 +save_path=$3 +gt_path=data/tvr_${eval_split_name}_release.jsonl + +python standalone_eval/eval.py \ +--gt_path ${gt_path} \ +--submission_path ${submission_path} \ +--save_path ${save_path} \ +${@:4} diff --git a/baselines/crossmodal_moment_localization/scripts/inference.sh b/baselines/crossmodal_moment_localization/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..cabae575b68fb445567e9b52d4f5c4675022a82e --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/inference.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VCMR) +tasks+=(SVMR) +tasks+=(VR) +echo "tasks ${tasks[@]}" +python baselines/crossmodal_moment_localization/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/crossmodal_moment_localization/scripts/inference_with_external.sh b/baselines/crossmodal_moment_localization/scripts/inference_with_external.sh new file mode 100644 index 0000000000000000000000000000000000000000..20bc039a01ca3dfe08744d8a61c88791678f4e3f --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/inference_with_external.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/inference_with_external.sh +#model_dir=$1 +# DO not use NMS, since it gives worse results +eval_model=$1 # [xml, xml_tef] +eval_split_name=$2 +external_model=mee # [mee, mcn, cal] +eval_path=data/tvr_${eval_split_name}_release.jsonl +project_root=./baselines + +# setup eval model +if [[ ${eval_model} == xml ]]; then + eval_model_dir=tvr-video_sub-resnet_i3d_no_norm_v-2019_11_03_12_22_19 +elif [[ ${eval_model} == xml_tef ]]; then + eval_model_dir=tvr-video_sub_tef-resnet_i3d_no_norm_v-2019_11_03_12_53_01 +fi + +# setup external +if [[ ${external_model} == mee ]]; then + external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39 + external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json +fi + +tasks=(VR) +tasks+=(SVMR) +tasks+=(VCMR) +echo "tasks ${tasks[@]}" +python baselines/crossmodal_moment_localization/inference.py \ +--model_dir ${eval_model_dir} \ +--tasks ${tasks[@]} \ 
+--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +--external_inference_vr_res_path ${external_inference_vr_res_path} \ +--eval_id ${external_model_dir} \ +${@:3} + +#--use_intermediate \ # temporary removed + diff --git a/baselines/crossmodal_moment_localization/scripts/train.sh b/baselines/crossmodal_moment_localization/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..4213ede6e5ef6c855b33d73b2795fea4f9cb9656 --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/train.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +# use --eval_tasks_at_training ["VR", "SVMR", "VCMR"] --stop_task ["VR", "SVMR", "VCMR"] for +# use --lw_neg_q 0 --lw_neg_ctx 0 for training SVMR/SVMR only +# use --lw_st_ed 0 for training with VR only +dset_name=$1 # see case below +ctx_mode=$2 # [video, sub, tef, video_sub, video_tef, sub_tef, video_sub_tef] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d] +feature_root=data/tvr_feature_release +results_root=baselines/crossmodal_moment_localization/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + clip_length=1.5 + extra_args+=(--max_ctx_l) + extra_args+=(100) # max_ctx_l = 100 for clip_length = 1.5, only ~109/21825 has more than 100. + extra_args+=(--max_pred_l) + extra_args+=(16) + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." 
+ desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +echo " python baselines/crossmodal_moment_localization/train.py --dset_name=${dset_name} --eval_split_name=${eval_split_name} --nms_thd=${nms_thd} --results_root=${results_root} --train_path=${train_path} --desc_bert_path=${desc_bert_path} --corpus_path=${corpus_path} --vid_feat_path=${vid_feat_path} --clip_length=${clip_length} --vid_feat_size=${vid_feat_size} --ctx_mode=${ctx_mode} ${extra_args[@]} ${@:4}" \ No newline at end of file diff --git a/baselines/crossmodal_moment_localization/start_end_dataset.py b/baselines/crossmodal_moment_localization/start_end_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c48735b97c9611b0113ae47bc6f67f7da640055d --- /dev/null +++ b/baselines/crossmodal_moment_localization/start_end_dataset.py @@ -0,0 +1,393 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +import time +import math +import random +from tqdm import tqdm +from utils.basic_utils import load_json, load_json, l2_normalize_np_array, flat_list_of_lists, merge_dicts +from utils.tensor_utils import pad_sequences_1d +from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \ + get_didemo_agreed_ts +import pandas as pd + +logger = logging.getLogger(__name__) + + +class StartEndDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + Return: + a dict: { + "meta": { + "query_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + } + "model_inputs": { + "query_feat": torch.tensor, (L, D_q) + "video_feat": torch.tensor, (n_clip_in_moment, D_video) + "sub_feat": torch.tensor, (n_clip_in_moment, D_sub) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, + max_desc_len, max_ctx_len, + vid_feat_path_or_handler, clip_length, ctx_mode="video", + normalize_vfeat=True, normalize_tfeat=True, h5driver=None, data_ratio=1.0): + self.dset_name = dset_name + self.data_path = data_path + self.data_ratio = data_ratio + + self.desc_bert_path_or_handler = desc_bert_path_or_handler + self.max_desc_len = max_desc_len + + self.sub_bert_path_or_handler = sub_bert_path_or_handler + self.max_ctx_len = max_ctx_len + self.vid_feat_path_or_handler = vid_feat_path_or_handler + self.clip_length = clip_length + self.ctx_mode = ctx_mode + + # prepare desc data + self.data = self.expand_annotations(load_json(data_path)) + + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + 
else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + def __len__(self): + return len(self.data) + + def expand_annotations(self, annotations): + new_annotations = [] + for i in annotations: + query = i["query"] + query_id = i["query_id"] + for moment in i["relevant_moment"]: + moment.update({'query': query, 'query_id': query_id}) + new_annotations.append(moment) + return new_annotations + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["query"], + vid_name=raw_data["video_name"], + duration=raw_data["duration"], + ts=raw_data["timestamp"] , + ) + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + + ctx_l = 0 + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + # note the tef features here are normalized clip indices (1.5 secs), instead of the original time (1 sec) + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + + model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], max_idx=ctx_l-1) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. 
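+            With clip_length = 1.5 (the value implied by the example above), this is
+            st_idx = floor(3.2 / 1.5) = 2 and ed_idx = ceil(7.6 / 1.5) = 6, both capped at max_idx.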
+ # TODO which one is better, [2: 5] or [2: 6) + """ + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) + return torch.LongTensor([st_idx, ed_idx]) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + +class StartEndEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. + load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. + """ + def __init__(self, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, clip_length=None, + ctx_mode="video", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + self.data_path = data_path + + + self.annotations = load_json(data_path) + self.ground_truth = self.get_relevant_moment_gt() + + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path) + self.video_data = [{"vid_name": k, "duration": v} for k, v in video_data.items()] + self.video2idx = {k: v for k, v in video_data.items()} + self.clip_length = clip_length + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + + def get_relevant_moment_gt(self): + gt_all = {} + for data in self.annotations: + gt_all[data["query_id"]] = data["relevant_moment"] + return gt_all + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + # def load_gt_vid_name_for_query(self, load_gt_video): + # """load_gt_video: bool, affect the returned value of self._get_item_query""" + # if load_gt_video: + # assert "vid_name" in self.query_data[0] + # self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return 
len(self.video_data) + else: + return len(self.annotations) + + def __getitem__(self, index): + if self.data_mode == "context": + return self._get_item_context(index) + else: + return self._get_item_query(index) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.annotations[index] + + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["query"], + vid_name=raw_data["video_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. + Given ts = [5, 9], st_idx = 3, ed_idx = 6, + clips should be indexed as [3: 6), the translated back ts should be [4.5:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + # TODO ed_idx -= 1, should also modify relevant code in inference.py + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length) - 1, max_idx) # st_idx could be the same as ed_idx + return torch.LongTensor([st_idx, ed_idx]) + + def _get_item_context(self, index): + """No need to batch, since it has already been batched here""" + raw_data = self.video_data[index] + + # initialize with basic data + meta = dict( + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ) + + model_inputs = dict() + ctx_l = 0 + + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + return dict(meta=meta, model_inputs=model_inputs) + + +def start_end_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? 
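+    # only the tensor inputs are batched below: every "*_feat" entry is padded to the longest
+    # sequence in the batch by pad_sequences_1d (which returns a (padded_tensor, mask) pair),
+    # and "st_ed_indices", when present, is simply stacked into a (N, 2) LongTensor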
+ + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = dict() + for k in model_inputs_keys: + if "feat" in k: + batched_data[k] = pad_sequences_1d( + [e["model_inputs"][k] for e in batch], dtype=torch.float32, fixed_length=None) + + if "st_ed_indices" in model_inputs_keys: + batched_data["st_ed_indices"] = torch.stack( + [e["model_inputs"]["st_ed_indices"] for e in batch], dim=0) + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + if "feat" in k: + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + else: + model_inputs[k] = v.to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.crossmodal_moment_localization.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/crossmodal_moment_localization/train.py b/baselines/crossmodal_moment_localization/train.py new file mode 100644 index 0000000000000000000000000000000000000000..8a79f66e1b5368e9de6bc99d110ec3ab96c8963b --- /dev/null +++ b/baselines/crossmodal_moment_localization/train.py @@ -0,0 +1,226 @@ +import os +import sys +sys.path.append("..") +sys.path.append(".") +import time +import json +import pprint +import random +import numpy as np +from easydict import EasyDict as EDict +from tqdm import tqdm, trange +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from utils.basic_utils import save_json + +from baselines.crossmodal_moment_localization.config import BaseOptions +from baselines.crossmodal_moment_localization.model_xml import XML +from baselines.crossmodal_moment_localization.start_end_dataset import \ + StartEndDataset, start_end_collate, StartEndEvalDataset, prepare_batch_inputs +from baselines.crossmodal_moment_localization.inference import eval_epoch, start_inference +from baselines.crossmodal_moment_localization.optimization import BertAdam +from utils.basic_utils import AverageMeter, get_logger +from utils.model_utils import count_parameters + +def get_eval_data(opt, data_path, data_mode): + dataset = StartEndEvalDataset( + data_path=data_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path if "video" in opt.ctx_mode else None, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + data_mode=data_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat) + return dataset + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def train(model, train_dataset, val_data, test_data, context_data, opt, logger): + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + 
model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + train_loader = DataLoader(train_dataset, + collate_fn=start_end_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01}, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0} + ] + + num_train_optimization_steps = len(train_loader) * opt.n_epoch + optimizer = BertAdam(optimizer_grouped_parameters, + lr=opt.lr, + weight_decay=opt.wd, + warmup=opt.lr_warmup_proportion, + t_total=num_train_optimization_steps, + schedule="warmup_linear") + thresholds = [0.3, 0.5, 0.7] + topks = [10, 20, 40] + best_val_ndcg = 0 + for epoch_i in range(0, opt.n_epoch): + print(f"TRAIN EPOCH: {epoch_i}|{opt.n_epoch}") + eval_step = len(train_loader) // opt.eval_num_per_epoch + if opt.hard_negtiave_start_epoch != -1 and epoch_i >= opt.hard_negtiave_start_epoch: + model.set_hard_negative(True, opt.hard_pool_size) + if opt.train_span_start_epoch != -1 and epoch_i >= opt.train_span_start_epoch: + model.set_train_st_ed(opt.lw_st_ed) + + num_training_examples = len(train_loader) + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + global_step = epoch_i * num_training_examples + batch_idx + 1 + model.train(mode=True) + + # continue + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + loss, loss_dict = model(**model_inputs) + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + + if global_step % eval_step == 0 or batch_idx == len(train_loader): + model.eval() + with torch.no_grad(): + val_performance, val_predictions = eval_epoch(model, val_data, context_data, logger, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + test_performance, test_predictions = eval_epoch(model, test_data, context_data, logger, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + logger.info(f"EPOCH: {epoch_i}") + anchor_ndcg = 0 + line1 = "" + line2 = "VAL: " + line3 = "TEST: " + anchor_ndcg = val_performance[20][0.5] + for K, vs in val_performance.items(): + for T, v in vs.items(): + line1 += f"NDCG@{K}, IoU={T}\t" + line2 += f" {v:.6f}" + + for K, vs in test_performance.items(): + for T, v in vs.items(): + line3 += f" {v:.6f}" + logger.info(line1) + logger.info(line2) + logger.info(line3) + + + if anchor_ndcg > best_val_ndcg: + print("~"*40) + save_json(val_predictions, os.path.join(opt.results_dir, "best_val_predictions.json")) + save_json(test_predictions, os.path.join(opt.results_dir, "best_test_predictions.json")) + best_val_ndcg = anchor_ndcg + logger.info("BEST " + line2) + logger.info("BEST " + line3) + checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + logger.info("save checkpoint: {}".format(opt.ckpt_filepath)) + print("~"*40) + + logger.info("") + + + +def main(): + opt = BaseOptions().parse() + set_seed(opt.seed) + logger = get_logger(opt.results_dir, opt.model_name +"_"+ opt.exp_id) + train_dataset = StartEndDataset( + dset_name=opt.dset_name, + data_path=opt.train_path, + 
desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + context_data = get_eval_data(opt, opt.val_path, data_mode="context") + val_data = get_eval_data(opt, opt.val_path, data_mode="query") + test_data = get_eval_data(opt, opt.test_path, data_mode="query") + + + + model_config = EDict( + merge_two_stream=not opt.no_merge_two_stream, # merge video and subtitles + cross_att=not opt.no_cross_att, # use cross-attention when encoding video and subtitles + span_predictor_type=opt.span_predictor_type, # span_predictor_type + encoder_type=opt.encoder_type, # gru, lstm, transformer + add_pe_rnn=opt.add_pe_rnn, # add pe for RNNs + pe_type=opt.pe_type, # + visual_input_size=opt.vid_feat_size, + sub_input_size=opt.sub_feat_size, # for both desc and subtitles + query_input_size=opt.q_feat_size, # for both desc and subtitles + hidden_size=opt.hidden_size, # + stack_conv_predictor_conv_kernel_sizes=opt.stack_conv_predictor_conv_kernel_sizes, # + conv_kernel_size=opt.conv_kernel_size, + conv_stride=opt.conv_stride, + max_ctx_l=opt.max_ctx_l, + max_desc_l=opt.max_desc_l, + input_drop=opt.input_drop, + cross_att_drop=opt.cross_att_drop, + drop=opt.drop, + n_heads=opt.n_heads, # self-att heads + initializer_range=opt.initializer_range, # for linear layer + ctx_mode=opt.ctx_mode, # video, sub or video_sub + margin=opt.margin, # margin for ranking loss + ranking_loss_type=opt.ranking_loss_type, # loss type, 'hinge' or 'lse' + lw_neg_q=opt.lw_neg_q, # loss weight for neg. query and pos. context + lw_neg_ctx=opt.lw_neg_ctx, # loss weight for pos. query and neg. context + lw_st_ed=0, # will be assigned dynamically at training time + use_hard_negative=False, # reset at each epoch + hard_pool_size=opt.hard_pool_size, + use_self_attention=not opt.no_self_att, # whether to use self attention + no_modular=opt.no_modular + ) + logger.info("model_config {}".format(model_config)) + model = XML(model_config) + count_parameters(model) + logger.info("Start Training...") + train(model, train_dataset, val_data, test_data, context_data, opt, logger) + + +if __name__ == '__main__': + main() diff --git a/baselines/excl/README.md b/baselines/excl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..936d63ed4ba56c2db49f2dd879fb04a8e103983e --- /dev/null +++ b/baselines/excl/README.md @@ -0,0 +1,25 @@ +# Extractive Clip Localization (ExCL) + +This folder contains the model described in the paper +``` +@article{ghosh2019excl, + title={ExCL: Extractive Clip Localization Using Natural Language Descriptions}, + author={Ghosh, Soham and Agarwal, Anuva and Parekh, Zarana and Hauptmann, Alexander}, + journal={NAACL}, + year={2019} +} +``` + +It also resembles the model in +``` +@article{lei2019tvqa+, + title={TVQA+: Spatio-Temporal Grounding for Video Question Answering}, + author={Lei, Jie and Yu, Licheng and Berg, Tamara L and Bansal, Mohit}, + journal={arXiv preprint arXiv:1904.11574}, + year={2019} +} +``` + +Disclaimer: This code is implemented by [Jie Lei](http://www.cs.unc.edu/~jielei/) for the TVR dataset, +it does not guarantee the reproducibility of the original authors' results. 
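+
+For a rough mental model: the network scores every clip of a video as a candidate start and as a
+candidate end of the moment, and the prediction is the highest-scoring valid (start, end) pair.
+A minimal sketch of that selection step (the names and interface here are illustrative, not the
+actual API of the code in this folder), assuming per-clip start/end logits and a 0/1 padding mask:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def pick_best_span(st_logits, ed_logits, mask, max_clips=16):
+    """st_logits, ed_logits, mask: (N, L) float tensors; mask is 1 for valid clips, 0 for padding."""
+    neg = -1e10
+    st_prob = F.softmax(st_logits * mask + (1 - mask) * neg, dim=-1)   # (N, L)
+    ed_prob = F.softmax(ed_logits * mask + (1 - mask) * neg, dim=-1)   # (N, L)
+    joint = torch.einsum("bm,bn->bmn", st_prob, ed_prob)               # score of every (st, ed) pair
+    joint = torch.triu(joint) - torch.triu(joint, diagonal=max_clips)  # keep 0 <= ed - st < max_clips
+    flat = joint.flatten(1).argmax(dim=1)                              # best pair per query
+    n_clips = joint.shape[-1]
+    return flat // n_clips, flat % n_clips                             # (start_idx, end_idx)
+```
+
+The implementation in this folder additionally runs one such start/end head per context stream
+(video and subtitles), averages the two distributions, and trains them with cross-entropy against
+the ground-truth start/end clip indices.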
+ diff --git a/baselines/excl/__init__.py b/baselines/excl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/excl/config.py b/baselines/excl/config.py new file mode 100644 index 0000000000000000000000000000000000000000..cad829c2d0634df6df19d47d08eb9b2034279ec2 --- /dev/null +++ b/baselines/excl/config.py @@ -0,0 +1,271 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile +from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. " + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") + self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=30, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=10, + help="number of epochs to early stop, use -1 to disable early stop") + self.parser.add_argument("--stop_task", type=str, default="SVMR", choices=["VCMR", "SVMR", "VR"]) + self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", + default=["SVMR"], choices=["VCMR", "SVMR", "VR"], + help="evaluate and report numbers for tasks specified here.") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=50, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_context_bsz", type=int, default=200, + help="mini-batch size at inference, for video/sub") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") + self.parser.add_argument("--lw_neg_q", type=float, default=1, + help="weight for ranking loss with negative query and positive context") + self.parser.add_argument("--lw_neg_ctx", type=float, default=1, + help="weight for ranking loss with positive query and negative context") + self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss") + self.parser.add_argument("--train_span_start_epoch", type=int, default=0, + help="which epoch to start training span prediction, -1 to disable") + self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], + help="att loss type, can be hinge loss or its smooth approximation LogSumExp") + self.parser.add_argument("--hard_negtiave_start_epoch", type=int, default=20, + help="which epoch to start hard negative sampling for video-level ranking loss," + "use -1 to disable") + self.parser.add_argument("--hard_pool_size", type=int, default=20, + help="hard negatives are still sampled, but from a harder pool.") + + # Model and Data config + self.parser.add_argument("--max_sub_l", type=int, default=50, + help="max length of all sub sentence 97.71 under 50 for 3 sentences") + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--max_ctx_l", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--eval_path", type=str, default=None, + help="Evaluating during training, for Dev set. 
If None, will only do training, " + "anet_cap and charades_sta has no dev set, so None") + self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features") + self.parser.add_argument("--word2idx_path", type=str, + help="a dict, {word: word_idx, ...}, " + "special tokens are {: 0, : 1, : 2}") + self.parser.add_argument("--vocab_size", type=int, default=-1, + help="Set automatically to len(word2idx)") + self.parser.add_argument("--glove_path", type=str, + help="path to file containing the GloVe embeddings for words in word2idx") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--ctx_mode", type=str, choices=["video", "sub", "video_sub", "tef", + "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. a combination of [video, sub, tef]") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it when using i3d_resnet concat feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + self.parser.add_argument("--clip_length", type=float, default=None, + help="each video will be uniformly segmented into small clips, " + "will automatically loaded from ProposalConfigs if None") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + + self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide evaluation. ") + self.parser.add_argument("--span_predictor_type", type=str, default="conv", choices=["conv", "cat_linear"], + help="how to generate span predictions, " + "conv: apply 1D-Conv layer on top of NxL dot product of query and clips" + "cat_linear: cat the query and clips then use a linear layer to give output. 
" + "Note cat_linear is implemented as first project query and clips into scores, " + "separately, then sum them up, this should be similar to first cat then project.") + self.parser.add_argument("--encoder_type", type=str, default="transformer", + choices=["gru", "lstm", "transformer", "cnn"]) + self.parser.add_argument("--add_pe_rnn", action="store_true", + help="Add positional encoding for GRU and LSTM encoder as well") + self.parser.add_argument("--no_merge_two_stream", action="store_true", help="do not merge video and subtitles") + self.parser.add_argument("--no_cross_att", action="store_true", + help="Use cross-attention for modeling video and subtitles") + self.parser.add_argument("--no_self_att", action="store_true", help="do not use self attention") + self.parser.add_argument("--no_modular", action="store_true", help="do not use modular attention") + self.parser.add_argument("--pe_type", type=str, default="cosine", choices=["none", "linear", "cosine"], + help="Only for query encoding") + self.parser.add_argument("--max_position_embeddings", type=int, default=300) + self.parser.add_argument("--hidden_size", type=int, default=128) + self.parser.add_argument("--n_heads", type=int, default=4) + self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs") + self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers") + self.parser.add_argument("--cross_att_drop", type=float, default=0.1, help="Applied to cross-att") + self.parser.add_argument("--conv_kernel_size", type=int, default=5) + self.parser.add_argument("--conv_stride", type=int, default=1) + self.parser.add_argument("--initializer_range", type=float, default=0.02, + help="initializer range for linear layer") + + # post processing + self.parser.add_argument("--min_pred_l", type=int, default=2, + help="constrain the [st, ed] with ed - st >= 2" + "(2 clips with length 1.5 each, 3 secs in total" + "this is the min length for proposal-based method)") + self.parser.add_argument("--max_pred_l", type=int, default=16, + help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total" + "(16 clips with length 1.5 each, " + "this is the max length for proposal-based method)") + self.parser.add_argument("--q2c_alpha", type=float, default=20, + help="give more importance to top scored videos' spans, " + "the new score will be: s_new = exp(alpha * s), " + "higher alpha indicates more importance. Note s in [-1, 1]") + + self.parser.add_argument("--max_before_nms", type=int, default=200) + self.parser.add_argument("--max_vcmr_video", type=int, default=100, + help="re-ranking in top-max_vcmr_video") + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 
0.6 for charades_sta, 0.5 for anet_cap,") + + def display_save(self, opt): + args = vars(opt) + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(args.items())})) + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + opt.eval_query_bsz = 100 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", "eval_split_name", + "eval_path", "max_pred_l", "min_pred_l"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + if opt.clip_length is None: + opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] + print("Loaded clip_length {} from proposal config file".format(opt.clip_length)) + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.ctx_mode, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"],) + + self.display_save(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if opt.hard_negtiave_start_epoch != -1: + if opt.hard_pool_size > opt.bsz: + print("[WARNING] hard_pool_size is larger than bsz") + + assert opt.stop_task in opt.eval_tasks_at_training + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + opt.pin_memory = not opt.no_pin_memory + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + if "tef" in opt.ctx_mode and "video" in opt.ctx_mode: + opt.vid_feat_size += 2 + if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode: + opt.sub_feat_size += 2 + + if "video" not in opt.ctx_mode or "sub" not in opt.ctx_mode: + opt.no_merge_two_stream = True + opt.no_cross_att = True + + self.opt = opt + return opt + + 
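+# At training time `BaseOptions().parse()` requires --exp_id and creates a fresh, timestamped
+# results_dir (a zipped copy of the code is also saved there). TestOptions below instead reloads
+# the opt.json stored under --model_dir and overwrites almost every option with the saved values,
+# so only evaluation-specific flags (e.g. --eval_split_name, --nms_thd) need to be passed at inference.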
+class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", + choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"], + help="Which tasks to run." + "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval. (will be performed automatically with VCMR)") diff --git a/baselines/excl/inference.py b/baselines/excl/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..440cf38613b5563f21299c8303a6e9f216b338df --- /dev/null +++ b/baselines/excl/inference.py @@ -0,0 +1,265 @@ +import os +import copy +import math +import pprint +from tqdm import tqdm, trange +import numpy as np + +import torch +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.excl.config import TestOptions +from baselines.excl.model import EXCL +from baselines.excl.start_end_dataset import \ + start_end_collate, ExCLDataset, prepare_batch_inputs +from baselines.clip_alignment_with_language.inference import \ + get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms +from utils.basic_utils import save_json +from utils.tensor_utils import pad_sequences_1d, find_max_triples, find_max_triples_from_upper_triangle_product +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def compute_query2ctx_info_svmr_only(model, eval_dataset, opt, + max_before_nms=1000, max_n_videos=200, tasks=("SVMR",)): + """Use val set to do evaluation, remember to run with torch.no_grad(). + estimated size 20,000 (query) * 500 (hsz) * 4 / (1024**2) = 38.15 MB + max_n_videos: int, use max_n_videos videos for computing VCMR results + """ + model.eval() + query_eval_loader = DataLoader(eval_dataset, + collate_fn=start_end_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + video2idx = eval_dataset.video2idx + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + ctx_len = eval_dataset.max_ctx_len # all pad to this length + + svmr_gt_st_probs = np.zeros((n_total_query, ctx_len), dtype=np.float32) + svmr_gt_ed_probs = np.zeros((n_total_query, ctx_len), dtype=np.float32) + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + _query_metas = batch[0] + query_metas.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + _, _, _st_probs, _ed_probs = model(**model_inputs) + # normalize to get true probabilities!!! 
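+        # (padded positions were already pushed to -1e10 by mask_logits in model.py, so a plain
+        # softmax over the full row leaves them with ~zero probability)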
+ # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_st_probs, dim=-1) # (_N_q, L) + _ed_probs = F.softmax(_ed_probs, dim=-1) + + svmr_gt_st_probs[idx * bsz:(idx + 1) * bsz, :_st_probs.shape[1]] = _st_probs.cpu().numpy() + svmr_gt_ed_probs[idx * bsz:(idx + 1) * bsz, :_ed_probs.shape[1]] = _ed_probs.cpu().numpy() + + if opt.debug: + break + svmr_res = get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, + query_metas, video2idx, + clip_length=opt.clip_length, + min_pred_l=opt.min_pred_l, + max_pred_l=opt.max_pred_l, + max_before_nms=max_before_nms) + return dict(SVMR=svmr_res) + + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs) # (N, L, L) + # extra_length_mask_array = np.ones(st_ed_prob_product.shape, dtype=bool) # (N, L, L) + # mask_triu = np.triu(extra_length_mask_array, k=min_pred_l) + # mask_triu_reversed = np.logical_not(np.triu(extra_length_mask_array, k=max_pred_l)) + # final_prob_mask = np.logical_and(mask_triu, mask_triu_reversed) # with valid bit to be 1 + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l) + st_ed_prob_product *= valid_prob_mask # invalid location will become zero! + + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. 
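+            # with the end index now exclusive, scaling both indices by clip_length converts
+            # (st_idx, ed_idx) from clip positions into a [st, ed) window in seconds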
+ _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +def get_eval_res(model, eval_dataset, opt, tasks, max_after_nms): + """compute and save query and video proposal embeddings""" + eval_res = compute_query2ctx_info_svmr_only(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=max_after_nms, + tasks=tasks) + eval_res["video2idx"] = eval_dataset.video2idx + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_after_nms=100): + """max_after_nms: always set to 100, since the eval script only evaluate top-100""" + model.eval() + logger.info("Computing scores") + eval_submission_raw = get_eval_res(model, eval_dataset, opt, tasks, max_after_nms=max_after_nms) + + IOU_THDS = (0.5, 0.7) + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=max_after_nms) + save_json(eval_submission, submission_path) + + metrics = eval_retrieval(eval_submission, eval_dataset.data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + if opt.nms_thd != -1: + logger.info("Performing nms with nms_thd {}".format(opt.nms_thd)) + eval_submission_after_nms = dict(video2idx=eval_submission_raw["video2idx"]) + for k, nms_func in POST_PROCESSING_MMS_FUNC.items(): + if k in eval_submission_raw: + eval_submission_after_nms[k] = nms_func(eval_submission_raw[k], + nms_thd=opt.nms_thd, + max_before_nms=opt.max_before_nms, + max_after_nms=max_after_nms) + + logger.info("Saving/Evaluating nms results") + submission_nms_path = submission_path.replace(".json", "_nms_thd_{}.json".format(opt.nms_thd)) + save_json(eval_submission_after_nms, submission_nms_path) + metrics_nms = eval_retrieval(eval_submission_after_nms, eval_dataset.data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_nms_path = submission_nms_path.replace(".json", "_metrics.json") + save_json(metrics_nms, save_metrics_nms_path, save_pretty=True, sort_keys=False) + latest_file_paths += [submission_nms_path, save_metrics_nms_path] + else: + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = EXCL(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and 
model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = ExCLDataset( + dset_name=opt.dset_name, + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + corpus_path=opt.corpus_path, + eval_split_name=opt.eval_split_name + ) + + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/excl/inference_with_vcmr.py b/baselines/excl/inference_with_vcmr.py new file mode 100644 index 0000000000000000000000000000000000000000..240551e309f8cb26f62f2b401e0109e9587a3ea5 --- /dev/null +++ b/baselines/excl/inference_with_vcmr.py @@ -0,0 +1,253 @@ +import os +import copy +import math +import pprint +from tqdm import tqdm, trange +import numpy as np + +import time +import torch +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.excl.config import TestOptions +from baselines.excl.model import EXCL +from baselines.excl.start_end_dataset import \ + start_end_collate, ExCLEvalDataset, prepare_batch_inputs +from baselines.clip_alignment_with_language.inference import \ + get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms +from utils.basic_utils import save_json, load_json, flat_list_of_lists +from utils.tensor_utils import pad_sequences_1d, find_max_triples, find_max_triples_from_upper_triangle_product +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def load_external_vr_res_with_scores(external_vr_res_path, top_n_vr_videos=5): + """return a mapping from query_id to top retrieved (vid_name, score)""" + external_vr_res = load_json(external_vr_res_path) + external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"] + query2video = {e["query_id"]: [[sub_e[0], sub_e[3]] for sub_e in e["predictions"]] for e in external_vr_res} + return query2video + + +def compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=1000, max_n_videos=200, tasks=("SVMR",)): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 500 (hsz) * 4 / (1024**2) = 38.15 MB + max_n_videos: int, use max_n_videos videos for computing VCMR results + """ + model.eval() + eval_dataset.set_data_mode("query") + + logger.info("Using external VR results from {}".format(opt.external_inference_vr_res_path)) + external_query2video = load_external_vr_res_with_scores( + opt.external_inference_vr_res_path, top_n_vr_videos=100) # {query_id: [(vid_name1, score1), ...]} + video2idx = eval_dataset.video2idx + idx2video = {v: k for k, v in video2idx.items()} + vcmr_res = [] + for idx, single_query_data in tqdm(enumerate(eval_dataset), desc="query2ctx", total=len(eval_dataset)): + single_query_meta = single_query_data["meta"] + query_id = single_query_meta["query_id"] + vid_names = [idx2video[e[0]] for e in external_query2video[query_id]] + bsz = len(vid_names) + model_inputs = eval_dataset.get_batched_context(vid_names)[1] + model_inputs["st_ed_indices"] = torch.zeros(bsz, 2).long() + model_inputs["query_feat"] = (single_query_data["model_inputs"]["query_feat"].unsqueeze(0).repeat(bsz, 1, 1), + torch.ones(bsz, len(single_query_data["model_inputs"]["query_feat"]))) + model_inputs = prepare_batch_inputs(model_inputs, device=opt.device, non_blocking=opt.pin_memory) + _, _, _st_probs, _ed_probs = model(**model_inputs) + + # normalize to get true probabilities!!! + # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_st_probs, dim=-1) # (_N_q, L) + _ed_probs = F.softmax(_ed_probs, dim=-1) + + vr_scores = _st_probs.new([e[1] for e in external_query2video[query_id]]).unsqueeze(1) # (N, 1) + + _st_probs = _st_probs * torch.exp(opt.q2c_alpha * vr_scores) + + st_ed_prob_product = torch.einsum("bm,bn->bmn", _st_probs, _ed_probs) # (Nq, L, L) + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, + min_l=opt.min_pred_l, + max_l=opt.max_pred_l) + st_ed_prob_product *= st_ed_prob_product.new(valid_prob_mask) # invalid location will become zero! + + st_ed_prob_product = st_ed_prob_product.cpu().numpy() + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=50, prob_thd=None) + # print("batched_sorted_triples", batched_sorted_triples[0][:4]) + # print("[12, ] + batched_sorted_triples[0][0]", [12, ] + batched_sorted_triples[0][0].tolist()) + # print("", batched_sorted_triples[0][0].tolist(), type(batched_sorted_triples[0][0].tolist())) + batched_spans_with_names = [] + for vid_name, b in zip(vid_names, batched_sorted_triples): + cur_video_idx = video2idx[vid_name] + batched_spans_with_names += [[cur_video_idx] + e.tolist() for e in b] + + # print("batched_spans_with_names", len(batched_spans_with_names), batched_spans_with_names[0]) + cur_vcmr_redictions = sorted(batched_spans_with_names, key=lambda x: x[3], reverse=True)[:max_before_nms] + cur_query_pred = dict( + query_id=single_query_meta["query_id"], + desc=single_query_meta["desc"], + predictions=cur_vcmr_redictions) + vcmr_res.append(cur_query_pred) + + if opt.debug and idx == 10: + break + return dict(VCMR=vcmr_res) + + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? 
The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_eval_res(model, eval_dataset, opt, tasks, max_after_nms): + """compute and save query and video proposal embeddings""" + eval_res = compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=max_after_nms, + tasks=tasks) + eval_res["video2idx"] = eval_dataset.video2idx + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_after_nms=100): + """max_after_nms: always set to 100, since the eval script only evaluate top-100""" + model.eval() + logger.info("Computing scores") + # logger.info("Start timing") + # times = [] + # for _ in range(3): + # st_time = time.time() + eval_submission_raw = get_eval_res(model, eval_dataset, opt, tasks, max_after_nms=max_after_nms) + # times += [time.time() - st_time] + # times = torch.FloatTensor(times) + + IOU_THDS = (0.5, 0.7) + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=max_after_nms) + save_json(eval_submission, submission_path) + + metrics = eval_retrieval(eval_submission, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + # metrics["time_avg"] = float(times.mean()) + # metrics["time_std"] = float(times.std()) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + if opt.nms_thd != -1: + logger.info("Performing nms with nms_thd {}".format(opt.nms_thd)) + eval_submission_after_nms = dict(video2idx=eval_submission_raw["video2idx"]) + for k, nms_func in POST_PROCESSING_MMS_FUNC.items(): + if k in eval_submission_raw: + eval_submission_after_nms[k] = nms_func(eval_submission_raw[k], + nms_thd=opt.nms_thd, + max_before_nms=opt.max_before_nms, + max_after_nms=max_after_nms) + + logger.info("Saving/Evaluating nms results") + submission_nms_path = submission_path.replace(".json", "_nms_thd_{}.json".format(opt.nms_thd)) + save_json(eval_submission_after_nms, submission_nms_path) + metrics_nms = eval_retrieval(eval_submission_after_nms, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_nms_path = submission_nms_path.replace(".json", "_metrics.json") + save_json(metrics_nms, save_metrics_nms_path, save_pretty=True, sort_keys=False) + latest_file_paths += [submission_nms_path, save_metrics_nms_path] + else: + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = EXCL(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + 
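+    # note: the checkpoint bundles "model_cfg" together with the weights, which is why the EXCL
+    # architecture can be rebuilt above without access to the original training configuration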
logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + assert opt.external_inference_vr_res_path is not None + + assert opt.eval_path is not None + eval_dataset = ExCLEvalDataset( + dset_name=opt.dset_name, + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + corpus_path=opt.corpus_path, + eval_split_name=opt.eval_split_name + ) + + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/excl/model.py b/baselines/excl/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ab2ef68204674791c02e2f154b0faf9bb822c43b --- /dev/null +++ b/baselines/excl/model.py @@ -0,0 +1,169 @@ +import math +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from utils.model_utils import RNNEncoder +from easydict import EasyDict as edict + + +excl_base_cfg = edict( + visual_input_size=2048, # changes based on visual input type + query_input_size=768, + sub_input_size=768, + hidden_size=256, # + drop=0.5, # dropout for other layers + ctx_mode="video_sub", # which context are used. 
'video', 'sub' or 'video_sub' + initializer_range=0.02, +) + + +class EXCL(nn.Module): + def __init__(self, config): + super(EXCL, self).__init__() + self.config = config + self.use_video = "video" in config.ctx_mode + self.use_sub = "sub" in config.ctx_mode + + self.query_encoder = RNNEncoder( + word_embedding_size=config.query_input_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=False, + return_hidden=True + ) + + if self.use_video: + self.video_encoder = RNNEncoder( + word_embedding_size=config.visual_input_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.video_encoder2 = RNNEncoder( + word_embedding_size=2*config.hidden_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.video_st_predictor = nn.Sequential( + nn.Linear(3*config.hidden_size, config.hidden_size), + nn.Tanh(), + nn.Linear(config.hidden_size, 1)) + self.video_ed_predictor = copy.deepcopy(self.video_st_predictor) + + if self.use_sub: + self.sub_encoder = RNNEncoder( + word_embedding_size=config.sub_input_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.sub_encoder2 = RNNEncoder( + word_embedding_size=2*config.hidden_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.sub_st_predictor = nn.Sequential( + nn.Linear(3*config.hidden_size, config.hidden_size), + nn.Tanh(), + nn.Linear(config.hidden_size, 1)) + self.sub_ed_predictor = copy.deepcopy(self.video_st_predictor) + + self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") + + self.reset_parameters() + + def reset_parameters(self): + """ Initialize the weights.""" + + def re_init(module): + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + module.reset_parameters() + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + self.apply(re_init) + + def get_prob_single_stream(self, encoded_query, ctx_feat, ctx_mask, module_name=None): + ctx_mask_rnn = ctx_mask.sum(1).long() + ctx_feat1 = getattr(self, module_name+"_encoder")( + F.dropout(ctx_feat, p=self.config.drop, training=self.training), + ctx_mask_rnn)[0] # (N, Lc, D) + ctx_feat2 = getattr(self, module_name+"_encoder2")( + F.dropout(torch.cat([ctx_feat1, encoded_query], dim=-1), p=self.config.drop, training=self.training), + ctx_mask_rnn)[0] # (N, Lc, D) + ctx_feat3 = torch.cat([ctx_feat2, ctx_feat1, encoded_query], dim=2) # (N, Lc, 3D) + st_probs = getattr(self, module_name+"_st_predictor")(ctx_feat3).squeeze() # (N, Lc) + ed_probs = getattr(self, module_name+"_ed_predictor")(ctx_feat3).squeeze() # (N, Lc) + st_probs = mask_logits(st_probs, ctx_mask) + ed_probs = mask_logits(ed_probs, ctx_mask) + return st_probs, ed_probs + + def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, + tef_feat, tef_mask, st_ed_indices, 
is_training=True): + """ + Args: + query_feat: (N, Lq, Dq) + query_mask: (N, Lq) + video_feat: (N, Lv, Dv) or None + video_mask: (N, Lv) or None + sub_feat: (N, Lv, Ds) or None + sub_mask: (N, Lv) or None + tef_feat: (N, Lv, 2) or None, + tef_mask: (N, Lv) or None, + st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively. + is_training: + """ + query_mask = query_mask.sum(1).long() + encoded_query = self.query_encoder(query_feat, query_mask)[1] # (N, D) + encoded_query = encoded_query.unsqueeze(1).repeat(1, video_feat.shape[1], 1) # (N, Lc, D) + + video_st_prob, video_ed_prob = self.get_prob_single_stream( + encoded_query, video_feat, video_mask, module_name="video") if self.use_video else (0, 0) + + sub_st_prob, sub_ed_prob = self.get_prob_single_stream( + encoded_query, sub_feat, sub_mask, module_name="sub") if self.use_sub else (0, 0) + + st_prob = (video_st_prob + sub_st_prob) / (self.use_video + self.use_sub) + ed_prob = (video_ed_prob + sub_ed_prob) / (self.use_video + self.use_sub) + + if is_training: + loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0]) + loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1]) + loss_st_ed = loss_st + loss_ed + + return loss_st_ed, {"loss_st_ed": float(loss_st_ed)}, st_prob, ed_prob + else: + # used to measure the runtime. not useful for other experiments. + prob_product = torch.einsum("bm,bn->bmn", st_prob, ed_prob) # (N, L, L) + prob_product = torch.triu(prob_product) # () + prob_product = prob_product.view(prob_product.shape[0], -1) + prob_product = torch.topk(prob_product, k=100, dim=1, largest=True) + return None + + +def mask_logits(target, mask): + return target * mask + (1 - mask) * (-1e10) diff --git a/baselines/excl/model_components.py b/baselines/excl/model_components.py new file mode 100644 index 0000000000000000000000000000000000000000..4ab6ba7d99e105c489089877a1f5ef7d630a5f41 --- /dev/null +++ b/baselines/excl/model_components.py @@ -0,0 +1,317 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DepthwiseSeparableConv(nn.Module): + """ + Depth-wise separable convolution uses less parameters to generate output by convolution. + :Examples: + >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1) + >>> input_tensor = torch.randn(32, 300, 20) + >>> output = m(input_tensor) + """ + + def __init__(self, in_ch, out_ch, k, dim=1, relu=True): + """ + :param in_ch: input hidden dimension size + :param out_ch: output hidden dimension size + :param k: kernel size + :param dim: default 1. 
1D conv or 2D conv + """ + super(DepthwiseSeparableConv, self).__init__() + self.relu = relu + if dim == 1: + self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + elif dim == 2: + self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + else: + raise Exception("Incorrect dimension!") + + def forward(self, x): + """ + :Input: (N, L_in, D) + :Output: (N, L_out, D) + """ + x = x.transpose(1, 2) + if self.relu: + out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True) + else: + out = self.pointwise_conv(self.depthwise_conv(x)) + return out.transpose(1, 2) # (N, L, D) + + +class ConvEncoder(nn.Module): + def __init__(self, kernel_size=7, n_filters=128, dropout=0.1): + super(ConvEncoder, self).__init__() + self.dropout = nn.Dropout(dropout) + self.layer_norm = nn.LayerNorm(n_filters) + self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True) + + def forward(self, x, mask): + """ + :param x: (N, L, D) + :param mask: (N, L), is not used. + :return: (N, L, D) + """ + return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D) + + +class TrainablePositionalEncoding(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + def __init__(self, max_position_embeddings, hidden_size, dropout=0.1): + super(TrainablePositionalEncoding, self).__init__() + self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_feat): + """ + Args: + input_feat: (N, L, D) + """ + bsz, seq_length = input_feat.shape[:2] + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) + position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) + + position_embeddings = self.position_embeddings(position_ids) + + embeddings = self.LayerNorm(input_feat + position_embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class PositionEncoding(nn.Module): + """ + Add positional information to input tensor. + :Examples: + >>> model = PositionEncoding(n_filters=6, max_len=10) + >>> test_input1 = torch.zeros(3, 10, 6) + >>> output1 = model(test_input1) + >>> output1.size() + >>> test_input2 = torch.zeros(5, 3, 9, 6) + >>> output2 = model(test_input2) + >>> output2.size() + """ + + def __init__(self, n_filters=128, max_len=500, pe_type="cosine"): + """ + :param n_filters: same with input hidden size + :param max_len: maximum sequence length + :param pe_type: cosine or linear or None + """ + super(PositionEncoding, self).__init__() + self.pe_type = pe_type + if pe_type != "none": + position = torch.arange(0, max_len).float().unsqueeze(1) + if pe_type == "cosine": + # Compute the positional encodings once in log space. 
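+                # Sinusoidal encoding, as in "Attention Is All You Need":
+                #   pe[pos, 2i] = sin(pos / 10000^(2i/D)),  pe[pos, 2i+1] = cos(pos / 10000^(2i/D)).
+                # div_term below equals 10000^(-2i/D), computed via exp/log for numerical stability.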
+ pe = torch.zeros(max_len, n_filters) # (L, D) + div_term = torch.exp(torch.arange(0, n_filters, 2).float() * - (math.log(10000.0) / n_filters)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + elif pe_type == "linear": + pe = position / max_len + else: + raise ValueError + self.register_buffer("pe", pe) # buffer is a tensor, not a variable, (L, D) + + def forward(self, x): + """ + :Input: (*, L, D) + :Output: (*, L, D) the same size as input + """ + if self.pe_type != "none": + pe = self.pe.data[:x.size(-2), :] # (#x.size(-2), n_filters) + extra_dim = len(x.size()) - 2 + for _ in range(extra_dim): + pe = pe.unsqueeze(0) + x = x + pe + return x + + +class LinearLayer(nn.Module): + """linear layer configurable with layer normalization, dropout, ReLU.""" + + def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True): + super(LinearLayer, self).__init__() + self.relu = relu + self.layer_norm = layer_norm + if layer_norm: + self.LayerNorm = nn.LayerNorm(in_hsz) + layers = [ + nn.Dropout(dropout), + nn.Linear(in_hsz, out_hsz) + ] + self.net = nn.Sequential(*layers) + + def forward(self, x): + """(N, L, D)""" + if self.layer_norm: + x = self.LayerNorm(x) + x = self.net(x) + if self.relu: + x = F.relu(x, inplace=True) + return x # (N, L, D) + + +bert_config = dict( + hidden_size=768, + intermediate_size=768, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_attention_heads=4, +) + + +class BertLayer(nn.Module): + def __init__(self, config, use_self_attention=True): + super(BertLayer, self).__init__() + self.use_self_attention = use_self_attention + if use_self_attention: + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + """ + Args: + hidden_states: (N, L, D) + attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid + Returns: + + """ + if self.use_self_attention: + attention_output = self.attention(hidden_states, attention_mask) + else: + attention_output = hidden_states + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + """ + Args: + input_tensor: (N, L, D) + attention_mask: (N, L) + Returns: + """ + self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Sequential( + nn.Linear(config.hidden_size, config.intermediate_size), + nn.ReLU(True)) + + def forward(self, hidden_states): + return self.dense(hidden_states) + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return 
hidden_states + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) # (N, nh, L, dh) + + def forward(self, query_states, key_states, value_states, attention_mask): + """ + Args: + query_states: (N, Lq, D) + key_states: (N, L, D) + value_states: (N, L, D) + attention_mask: (N, Lq, L) + Returns: + """ + # only need to mask the dimension where the softmax (last dim) is applied, as another dim (second last) + # will be ignored in future computation anyway + attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L) + mixed_query_layer = self.query(query_states) + mixed_key_layer = self.key(key_states) + mixed_value_layer = self.value(value_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh) + key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh) + value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
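+        # nn.Dropout(p) zeroes each attention weight with probability p and rescales the
+        # survivors by 1/(1-p), so the attended context keeps its expected scale.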
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states diff --git a/baselines/excl/optimization.py b/baselines/excl/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..ac4c3095b2f07ef688c450d493c889ca459856ad --- /dev/null +++ b/baselines/excl/optimization.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +import abc +import sys + +logger = logging.getLogger(__name__) + + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class _LRSchedule(ABC): + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense + def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ + super(_LRSchedule, self).__init__(**kw) + if t_total < 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) + self.warned_for_t_total_at_progress = -1 + + def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ + if self.t_total < 0: + return 1. + progress = float(step) / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. 
and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + @abc.abstractmethod + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ + return 1. + + +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + """ + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. + :param kw: + """ + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) + return ret + + +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * progress)) + return ret + + +class WarmupConstantSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. 
after warmup. + """ + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + """ + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0.) + + +SCHEDULES = { + None: ConstantLR, + "none": ConstantLR, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule +} + + +class EMA(object): + """ Exponential Moving Average for model parameters. + references: + [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py + [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py""" + def __init__(self, decay): + self.decay = decay + self.shadow = {} + self.original = {} + + def register(self, name, val): + self.shadow[name] = val.clone() + + def __call__(self, model, step): + decay = min(self.decay, (1 + step) / (10.0 + step)) + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + new_average = \ + (1.0 - decay) * param.data + decay * self.shadow[name] + self.shadow[name] = new_average.clone() + + def assign(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + self.original[name] = param.data.clone() + param.data = self.shadow[name] + + def resume(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + param.data = self.original[name] + + +class BertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 + schedule: schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 + """ + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. " + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + super(BertAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(1 - beta1, grad) + next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
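+                # Decoupled (AdamW-style) decay: update = m / (sqrt(v) + eps) + weight_decay * p,
+                # then scaled by the scheduled learning rate below; note no bias correction is applied.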
+ if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + # No bias correction + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + + return loss diff --git a/baselines/excl/scripts/eval.sh b/baselines/excl/scripts/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b506bb56df447c8c67fa84e6927de4e75f2613e --- /dev/null +++ b/baselines/excl/scripts/eval.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/modular_moment_localization/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +eval_split_name=$1 +submission_path=$2 +save_path=$3 +gt_path=data/tvr_${eval_split_name}_release.jsonl + +python standalone_eval/eval.py \ +-gt_path ${gt_path} \ +-submission_path ${submission_path} \ +-save_path ${save_path} \ +${@:4} diff --git a/baselines/excl/scripts/inference.sh b/baselines/excl/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..db796fbca326987f8cc6bdb5c0d71acb327b38a7 --- /dev/null +++ b/baselines/excl/scripts/inference.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/excl/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VCMR) +tasks+=(SVMR) +tasks+=(VR) +echo "tasks ${tasks[@]}" +python baselines/excl/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/excl/scripts/inference_with_vcmr.sh b/baselines/excl/scripts/inference_with_vcmr.sh new file mode 100644 index 0000000000000000000000000000000000000000..87ac8cd0c5e2313bb8227731ebb174c2c43fe288 --- /dev/null +++ b/baselines/excl/scripts/inference_with_vcmr.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/excl/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VCMR) + +project_root=./baselines +external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39 +external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json + + +echo "tasks ${tasks[@]}" +python baselines/excl/inference_with_vcmr.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--external_inference_vr_res_path ${external_inference_vr_res_path} \ +--eval_id ${external_model_dir} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/excl/scripts/train.sh b/baselines/excl/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..ed851ea8d3a7a34a058898ffced532e36dd7cfcc --- /dev/null +++ b/baselines/excl/scripts/train.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/excl/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +# use --eval_tasks_at_training ["VR", "SVMR", "VCMR"] --stop_task ["VR", "SVMR", "VCMR"] for +# use --lw_neg_q 0 --lw_neg_ctx 0 for training SVMR/SVMR only +# use --lw_st_ed 0 for training with VR only +dset_name=$1 # see case 
below +ctx_mode=$2 # ["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d] +feature_root=data/tvr_feature_release +results_root=baselines/excl/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + clip_length=1.5 + extra_args+=(--max_ctx_l) + extra_args+=(100) # max_ctx_l = 100 for clip_length = 1.5, only ~109/21825 has more than 100. + extra_args+=(--max_pred_l) + extra_args+=(16) + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." + desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +python baselines/excl/train.py \ +--dset_name=${dset_name} \ +--eval_split_name=${eval_split_name} \ +--nms_thd=${nms_thd} \ +--results_root=${results_root} \ +--train_path=${train_path} \ +--desc_bert_path=${desc_bert_path} \ +--corpus_path=${corpus_path} \ +--vid_feat_path=${vid_feat_path} \ +--clip_length=${clip_length} \ +--vid_feat_size=${vid_feat_size} \ +--ctx_mode=${ctx_mode} \ +${extra_args[@]} \ +${@:4} diff --git a/baselines/excl/start_end_dataset.py b/baselines/excl/start_end_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..158c0b6144bf0ec14de46daad47c23a5623015f1 --- /dev/null +++ b/baselines/excl/start_end_dataset.py @@ -0,0 +1,380 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +import time +import math +import random +from tqdm import tqdm +from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array, flat_list_of_lists, merge_dicts +from utils.tensor_utils import pad_sequences_1d +from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \ + get_didemo_agreed_ts + +logger = logging.getLogger(__name__) + + +class ExCLDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + 
Return: + a dict: { + "meta": { + "query_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + } + "model_inputs": { + "query_feat": torch.tensor, (L, D_q) + "video_feat": torch.tensor, (n_clip_in_moment, D_video) + "sub_feat": torch.tensor, (n_clip_in_moment, D_sub) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, + max_desc_len, max_ctx_len, + vid_feat_path_or_handler, clip_length, ctx_mode="video", + normalize_vfeat=True, normalize_tfeat=True, h5driver=None, data_ratio=1.0, + corpus_path=None, eval_split_name=None): + self.dset_name = dset_name + self.data_path = data_path + self.data_ratio = data_ratio + + self.desc_bert_path_or_handler = desc_bert_path_or_handler + self.max_desc_len = max_desc_len + + self.sub_bert_path_or_handler = sub_bert_path_or_handler + self.max_ctx_len = max_ctx_len + self.vid_feat_path_or_handler = vid_feat_path_or_handler + self.clip_length = clip_length + self.ctx_mode = ctx_mode + + # prepare desc data + self.data = load_jsonl(data_path) + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + if corpus_path is not None: + video_data = load_json(corpus_path)[eval_split_name] + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ts=raw_data["ts"] if self.dset_name != "didemo" else get_didemo_agreed_ts(raw_data["ts"]), + ) + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + + ctx_l = 0 + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = 
torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + # note the tef features here are normalized clip indices (1.5 secs), instead of the original time (1 sec) + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + + model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], max_idx=ctx_l-1) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) + return torch.LongTensor([st_idx, ed_idx]) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + +class ExCLEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. + load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. 
+ """ + def __init__(self, dset_name, eval_split_name, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, clip_length=None, + ctx_mode="video", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): + self.dset_name = dset_name + self.eval_split_name = eval_split_name + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + self.data_path = data_path + self.query_data = load_jsonl(data_path) + if data_ratio != 1: + n_examples = int(len(self.query_data) * data_ratio) + self.query_data = self.query_data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path)[self.eval_split_name] + self.video_data = {k: v[0] for k, v in video_data.items()} + self.video2idx = {k: v[1] for k, v in video_data.items()} + self.clip_length = clip_length + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + def load_gt_vid_name_for_query(self, load_gt_video): + """load_gt_video: bool, affect the returned value of self._get_item_query""" + assert "vid_name" in self.query_data[0] + self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return len(self.video_data) + else: + return len(self.query_data) + + def __getitem__(self, index): + return self._get_item_query(index) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.query_data[index] + + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. 
+ Given ts = [5, 9], st_idx = 3, ed_idx = 6, + clips should be indexed as [3: 6), the translated back ts should be [4.5:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + # TODO ed_idx -= 1, should also modify relevant code in inference.py + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length) - 1, max_idx) # st_idx could be the same as ed_idx + return torch.LongTensor([st_idx, ed_idx]) + + def get_batched_context(self, vid_names): + batch = [self._get_item_context_by_vid_name(e) for e in vid_names] + metas, model_inputs = start_end_collate(batch) + return metas, model_inputs + + def _get_item_context_by_vid_name(self, vid_name): + """No need to batch, since it has already been batched here""" + # initialize with basic data + meta = dict( + vid_name=vid_name, + duration=self.video_data[vid_name], + ) + + model_inputs = dict() + ctx_l = 0 + + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + return dict(meta=meta, model_inputs=model_inputs) + + +def start_end_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? 
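+    # pad_sequences_1d returns a (padded_tensor, mask) pair for every "*_feat" key;
+    # prepare_batch_inputs below unpacks v[0] as the feature and v[1] as its mask.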
+ + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = dict() + for k in model_inputs_keys: + if "feat" in k: + batched_data[k] = pad_sequences_1d( + [e["model_inputs"][k] for e in batch], dtype=torch.float32, fixed_length=None) + + if "st_ed_indices" in model_inputs_keys: + batched_data["st_ed_indices"] = torch.stack( + [e["model_inputs"]["st_ed_indices"] for e in batch], dim=0) + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + if "feat" in k: + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + else: + model_inputs[k] = v.to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.crossmodal_moment_localization.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/excl/train.py b/baselines/excl/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ac40adae00163564842d774c32b3884a5e16e7df --- /dev/null +++ b/baselines/excl/train.py @@ -0,0 +1,305 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from easydict import EasyDict as EDict +from tqdm import tqdm, trange +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from baselines.excl.config import BaseOptions +from baselines.excl.model import EXCL +from baselines.excl.start_end_dataset import \ + ExCLDataset, start_end_collate, prepare_batch_inputs +from baselines.excl.inference import eval_epoch, start_inference +from utils.basic_utils import AverageMeter +from utils.model_utils import count_parameters + + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True): + logger.info("use train_epoch func for training: {}".format(training)) + model.train(mode=training) + + # init meters + dataloading_time = AverageMeter() + prepare_inputs_time = AverageMeter() + model_forward_time = AverageMeter() + model_backward_time = AverageMeter() + loss_meters = OrderedDict(loss_st_ed=AverageMeter()) + + num_training_examples = len(train_loader) + timer_dataloading = time.time() + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + global_step = epoch_i * num_training_examples + batch_idx + dataloading_time.update(time.time() - timer_dataloading) + + # continue + timer_start = time.time() + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + prepare_inputs_time.update(time.time() - timer_start) + # logger.info("model_inputs {}" + # .format({k: (type(k), v.shape if isinstance(v, torch.Tensor) else v) + # for k, v in model_inputs.items()})) + # logger.info("model_inputs \n{}".format({k: (type(v), v.shape, v.dtype) for k, v in model_inputs.items()})) + timer_start = time.time() + loss, loss_dict, _, _ = model(**model_inputs) + 
model_forward_time.update(time.time() - timer_start) + timer_start = time.time() + if training: + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + model_backward_time.update(time.time() - timer_start) + + opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) + for k, v in loss_dict.items(): + opt.writer.add_scalar("Train/{}".format(k), v, global_step) + + for k, v in loss_dict.items(): + loss_meters[k].update(float(v)) + + timer_dataloading = time.time() + if opt.debug and batch_idx == 3: + break + + if training: + to_write = opt.train_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + loss_str=" ".join(["{} {:.4f}".format(k, v.avg) for k, v in loss_meters.items()])) + with open(opt.train_log_filepath, "a") as f: + f.write(to_write) + print("Epoch time stats:") + print("dataloading_time: max {dataloading_time.max} " + "min {dataloading_time.min} avg {dataloading_time.avg}\n" + "prepare_inputs_time: max {prepare_inputs_time.max} " + "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" + "model_forward_time: max {model_forward_time.max} " + "min {model_forward_time.min} avg {model_forward_time.avg}\n" + "model_backward_time: max {model_backward_time.max} " + "min {model_backward_time.min} avg {model_backward_time.avg}\n" + "".format(dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, + model_forward_time=model_forward_time, model_backward_time=model_backward_time)) + else: + for k, v in loss_meters.items(): + opt.writer.add_scalar("Eval_Loss/{}".format(k), v.avg, epoch_i) + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def train(model, train_dataset, val_dataset, opt): + # Prepare optimizer + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + train_loader = DataLoader(train_dataset, + collate_fn=start_end_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + # Prepare optimizer + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=opt.lr) + + prev_best_score = 0. 
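+    # Early-stopping bookkeeping: es_cnt counts epochs without improvement on the
+    # opt.stop_task metrics; training halts once es_cnt exceeds opt.max_es_cnt (if != -1).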
+ es_cnt = 0 + start_epoch = -1 if opt.eval_untrained else 0 + eval_tasks_at_training = opt.eval_tasks_at_training # VR is computed along with VCMR + save_submission_filename = \ + "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, "_".join(eval_tasks_at_training)) + for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): + if epoch_i > -1: + with torch.autograd.detect_anomaly(): + train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True) + global_step = (epoch_i + 1) * len(train_loader) + if opt.eval_path is not None: + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, val_dataset, opt, save_submission_filename, + tasks=eval_tasks_at_training, max_after_nms=100) + to_write = opt.eval_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + eval_metrics_str=json.dumps(metrics_no_nms)) + with open(opt.eval_log_filepath, "a") as f: + f.write(to_write) + logger.info("metrics_no_nms {}".format(pprint.pformat(rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) + logger.info("metrics_nms {}".format(pprint.pformat(metrics_nms, indent=4))) + + # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms + metrics = metrics_no_nms + # early stop/ log / save model + for task_type in ["SVMR", "VCMR"]: + if task_type in metrics: + task_metrics = metrics[task_type] + for iou_thd in [0.5, 0.7]: + opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), + {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, + global_step) + + task_type = "VR" + if task_type in metrics: + task_metrics = metrics[task_type] + opt.writer.add_scalars("Eval/{}".format(task_type), + {k: v for k, v in task_metrics.items()}, + global_step) + + # use the most strict metric available + stop_metric_names = ["r1"] if opt.stop_task == "VR" else ["0.5-r1", "0.7-r1"] + stop_score = sum([metrics[opt.stop_task][e] for e in stop_metric_names]) + + if stop_score > prev_best_score: + es_cnt = 0 + prev_best_score = stop_score + + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] + for src, tgt in zip(latest_file_paths, best_file_paths): + os.renames(src, tgt) + logger.info("The checkpoint file has been updated.") + else: + es_cnt += 1 + if opt.max_es_cnt != -1 and es_cnt > opt.max_es_cnt: # early stop + with open(opt.train_log_filepath, "a") as f: + f.write("Early Stop at epoch {}".format(epoch_i)) + logger.info("Early stop at {} with {} {}" + .format(epoch_i, " ".join([opt.stop_task] + stop_metric_names), prev_best_score)) + break + else: + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + if opt.debug: + break + + opt.writer.close() + + +def start_training(): + logger.info("Setup config, data and model...") + opt = BaseOptions().parse() + set_seed(opt.seed) + if opt.debug: # keep the model run deterministically + # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. + # Enable this only when input size is fixed. 
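+        # cudnn.deterministic = True additionally forces deterministic kernels,
+        # trading speed for run-to-run reproducibility while debugging.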
+ cudnn.benchmark = False + cudnn.deterministic = True + + opt.writer = SummaryWriter(opt.tensorboard_log_dir) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + train_dataset = ExCLDataset( + dset_name=opt.dset_name, + data_path=opt.train_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + if opt.eval_path is not None: + eval_dataset = ExCLDataset( + dset_name=opt.dset_name, + data_path=opt.eval_path, + desc_bert_path_or_handler=train_dataset.desc_bert_h5, + sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + corpus_path=opt.corpus_path, + eval_split_name=opt.eval_split_name + ) + else: + eval_dataset = None + + model_config = EDict( + visual_input_size=opt.vid_feat_size, + sub_input_size=opt.sub_feat_size, # for both desc and subtitles + query_input_size=opt.q_feat_size, # for both desc and subtitles + hidden_size=opt.hidden_size, + drop=opt.drop, + ctx_mode=opt.ctx_mode, # video, sub or video_sub + initializer_range=opt.initializer_range + ) + logger.info("model_config {}".format(model_config)) + model = EXCL(model_config) + count_parameters(model) + logger.info("Start Training...") + train(model, train_dataset, eval_dataset, opt) + return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug + + +if __name__ == '__main__': + model_dir, eval_split_name, eval_path, debug = start_training() + if not debug: + model_dir = model_dir.split(os.sep)[-1] + tasks = ["SVMR"] + input_args = ["--model_dir", model_dir, + "--eval_split_name", eval_split_name, + "--eval_path", eval_path, + "--tasks"] + tasks + + import sys + sys.argv[1:] = input_args + logger.info("\n\n\nFINISHED TRAINING!!!") + logger.info("Evaluating model in {}".format(model_dir)) + logger.info("Input args {}".format(sys.argv[1:])) + start_inference() diff --git a/baselines/mixture_embedding_experts/README.md b/baselines/mixture_embedding_experts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5aa7a03f342e5f60876f416e281043c0044ff110 --- /dev/null +++ b/baselines/mixture_embedding_experts/README.md @@ -0,0 +1,14 @@ +# Mixture Embedding Experts (MEE) + +This folder contains the model described in the paper +``` +@article{miech18learning, + title={Learning a {T}ext-{V}ideo {E}mbedding from {I}ncomplete and {H}eterogeneous {D}ata}, + author={Miech, Antoine and Laptev, Ivan and Sivic, Josef}, + journal={arXiv:1804.02516}, + year={2018}, +} +``` + +Disclaimer: This code is implemented by [Jie Lei](http://www.cs.unc.edu/~jielei/) for the TVR dataset, +it does not guarantee the reproducibility of the original authors' results. 
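+
+For intuition only, the sketch below shows an MEE-style gated mixture of per-modality
+similarities in the spirit of the paper cited above. All names here (`MixtureOfExpertsSketch`,
+`expert_dims`, etc.) are made up for illustration and are not the classes implemented in this folder.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class MixtureOfExpertsSketch(nn.Module):
+    """Gated mixture of per-modality ("expert") similarities, MEE-style."""
+    def __init__(self, query_dim, expert_dims, joint_dim=256):
+        super().__init__()
+        # one projection per modality, plus a matching query projection for each
+        self.expert_proj = nn.ModuleList([nn.Linear(d, joint_dim) for d in expert_dims])
+        self.query_proj = nn.ModuleList([nn.Linear(query_dim, joint_dim) for _ in expert_dims])
+        # gating weights are predicted from the query alone
+        self.gate = nn.Linear(query_dim, len(expert_dims))
+
+    def forward(self, query_feat, expert_feats):
+        """query_feat: (N, Dq); expert_feats: list of (M, D_i) tensors, one per modality."""
+        gates = F.softmax(self.gate(query_feat), dim=-1)  # (N, n_experts)
+        sims = []
+        for i, feat in enumerate(expert_feats):
+            q = F.normalize(self.query_proj[i](query_feat), dim=-1)  # (N, joint_dim)
+            v = F.normalize(self.expert_proj[i](feat), dim=-1)       # (M, joint_dim)
+            sims.append(q @ v.t())                                   # (N, M) cosine similarities
+        sims = torch.stack(sims, dim=-1)                # (N, M, n_experts)
+        return (sims * gates.unsqueeze(1)).sum(dim=-1)  # (N, M) fused query-video scores
+
+
+if __name__ == "__main__":
+    model = MixtureOfExpertsSketch(query_dim=768, expert_dims=[2048, 768])
+    scores = model(torch.randn(4, 768), [torch.randn(10, 2048), torch.randn(10, 768)])
+    print(scores.shape)  # torch.Size([4, 10])
+```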
diff --git a/baselines/mixture_embedding_experts/__init__.py b/baselines/mixture_embedding_experts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/mixture_embedding_experts/config.py b/baselines/mixture_embedding_experts/config.py new file mode 100644 index 0000000000000000000000000000000000000000..93d96d75651f79ca40b64fa5130ba88d5cfe2455 --- /dev/null +++ b/baselines/mixture_embedding_experts/config.py @@ -0,0 +1,164 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default="res", help="id of the current run") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. 
" + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + self.parser.add_argument("--wd", type=float, default=0, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=50, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=10, help="number of epochs to early stop") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=1000, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_ctx_bsz", type=int, default=200, + help="mini-batch size at inference, for proposals") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.2, help="margin for hinge loss") + + # Model and Data config + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--max_ctx_l", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--eval_path", type=str, default=None, + help="Evaluating during training, for Dev set. If None, will only do training, " + "anet_cap and charades_sta has no dev set, so None") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--desc_feat_size", type=int, default=768) + self.parser.add_argument("--ctx_mode", type=str, + choices=["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. 
a combination of [video, sub, tef]") + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it when using i3d_resnet concat feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + + self.parser.add_argument("--output_size", type=int, default=256) + + def display_save(self, opt): + args = vars(opt) + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(args.items())})) + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", + "eval_split_name", "eval_path", "eval_query_bsz", "eval_ctx_bsz"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.ctx_mode, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"]) + + self.display_save(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.pin_memory = not opt.no_pin_memory + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + self.opt = opt + return opt + + +class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + 
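+        # reuse every BaseOptions argument, then add the evaluation-only flags below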
BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", choices=["VCMR", "SVMR", "VR"], default="SVMR", + help="Which tasks to run." + "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval.") diff --git a/baselines/mixture_embedding_experts/inference.py b/baselines/mixture_embedding_experts/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..b65d36bfa45dbc8400794832a9a8ce501a296f65 --- /dev/null +++ b/baselines/mixture_embedding_experts/inference.py @@ -0,0 +1,234 @@ +import os +import pprint +import time +from tqdm import tqdm, trange + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.mixture_embedding_experts.config import TestOptions +from baselines.mixture_embedding_experts.model import MEE +from baselines.mixture_embedding_experts.retrieval_dataset import \ + retrieval_collate, RetrievalEvalDataset, prepare_batch_inputs +from utils.basic_utils import save_json +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def compute_context_embeddings(model, eval_dataset, opt): + """Use val set to do evaluation, remember to run with torch.no_grad(). + estimated 1000 (videos) * 300 (proposals) * 20 (clips) * 100 (hsz) * 4 / (1024 ** 3) = 2.24 GB + """ + model.eval() + eval_dataset.set_data_mode("context") + context_eval_loader = DataLoader(eval_dataset, + collate_fn=retrieval_collate, + batch_size=opt.eval_ctx_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + n_videos = len(eval_dataset) + eval_ctx_bsz = opt.eval_ctx_bsz + global_meta_list = [] # list(dicts) + global_video_embedding, global_sub_embedding = None, None + if model.use_video: + global_video_embedding = torch.empty((n_videos, model.config.output_size), + dtype=torch.float32, device=opt.device) # (N_q, D_o) + if model.use_sub: + global_sub_embedding = torch.empty((n_videos, model.config.output_size), + dtype=torch.float32, device=opt.device) # (N_q, D_o) + for idx, batch in tqdm(enumerate(context_eval_loader), + desc="Computing context embedding for videos", + total=len(context_eval_loader)): + global_meta_list.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + encoded_video, encoded_sub = model.encode_context(model_inputs["video_feat"], model_inputs["sub_feat"]) + if model.use_video: + global_video_embedding[idx * eval_ctx_bsz: (idx + 1) * eval_ctx_bsz] = encoded_video + if model.use_sub: + global_sub_embedding[idx * eval_ctx_bsz: (idx + 1) * eval_ctx_bsz] = encoded_sub + + if opt.debug and idx == 100: + break + return dict(video_meta=global_meta_list, + encoded_video=global_video_embedding, + encoded_sub=global_sub_embedding) + + +def compute_query2ctx_scores(model, eval_dataset, opt, max_n_videos=100): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 100 (hsz) * 4 / (1024**2) = 7.63 MB + """ + ctx_info = compute_context_embeddings(model, eval_dataset, opt) + + model.eval() + eval_dataset.set_data_mode("query") + query_eval_loader = DataLoader(eval_dataset, + collate_fn=retrieval_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + global_meta_list = [] # list(dicts) + eval_query_bsz = opt.eval_query_bsz + n_query = eval_query_bsz if opt.debug else len(eval_dataset) + all_scores = torch.empty((n_query, max_n_videos), dtype=torch.float32) # (N_q, max_n_videos) + all_indices = torch.empty((n_query, max_n_videos), dtype=torch.long) # (N_q, max_n_videos) + for idx, batch in tqdm(enumerate(query_eval_loader), + desc="Computing q embedding", + total=len(query_eval_loader)): + global_meta_list.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + pooled_query = model.query_pooling(model_inputs["query_feat"]) # (Nq, Dt) + conf_matrix = model.get_score_from_pooled_query_with_encoded_ctx( + pooled_query, ctx_info["encoded_video"], ctx_info["encoded_sub"]) # (Nq, Nc) + sorted_values, sorted_indices = \ + torch.topk(conf_matrix, max_n_videos, dim=1, largest=True) # (Nq, max_n_videos) + all_scores[idx * eval_query_bsz: (idx + 1) * eval_query_bsz] = sorted_values.cpu() + all_indices[idx * eval_query_bsz: (idx + 1) * eval_query_bsz] = sorted_indices.cpu() + if opt.debug: + break + return dict( + video_meta=ctx_info["video_meta"], + query_meta=global_meta_list, + q2ctx_scores=all_scores, + q2ctx_indices=all_indices, + video2idx=eval_dataset.video2idx + ) + + +def generate_vr_predictions_from_res(eval_res): + video_meta = eval_res["video_meta"] # list, (Nc, ) + query_meta = eval_res["query_meta"] # list, (Nq, ) + video2idx = eval_res["video2idx"] + q2ctx_scores = eval_res["q2ctx_scores"] # (Nq, max_n_videos) + q2ctx_indices = eval_res["q2ctx_indices"] # (Nq, max_n_videos) + + vr_res = [] + for i, (scores_row, indices_row) in tqdm(enumerate(zip(q2ctx_scores, q2ctx_indices)), + desc="[VR] Loop over queries to generate predictions", + total=len(query_meta)): + cur_vr_redictions = [] + for j, (v_score, v_meta_idx) in enumerate(zip(scores_row, indices_row)): + video_idx = video2idx[video_meta[v_meta_idx]["vid_name"]] + cur_vr_redictions.append([video_idx, 0, 0, float(v_score)]) + cur_query_pred = dict( + query_id=query_meta[i]["query_id"], + desc=query_meta[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + return vr_res + + +def get_submission_top_n(submission, top_n=100): + def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + top_n_submission = dict(video2idx=submission["video2idx"], ) + for k in submission: + if k != "video2idx": + top_n_submission[k] = get_prediction_top_n(submission[k], top_n) + return top_n_submission + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_before_nms=1000, max_after_nms=100): + model.eval() + logger.info("Computing scores") + logger.info("Start timing") + # times = [] + # for _ in range(3): + # st_time = time.time() + eval_res = compute_query2ctx_scores(model, eval_dataset, opt) + logger.info("Generating predictions from scores") + eval_submission_raw = dict(video2idx=eval_res["video2idx"]) + eval_submission_raw["VR"] = 
generate_vr_predictions_from_res(eval_res) + # times += [time.time() - st_time] + # times = torch.FloatTensor(times) + IOU_THDS = (0.5, 0.7) + + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=100) + save_json(eval_submission, submission_path) + + metrics = eval_retrieval(eval_submission, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + # metrics["time_avg"] = float(times.mean()) + # metrics["time_std"] = float(times.std()) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = MEE(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = RetrievalEvalDataset( + dset_name=opt.dset_name, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + model = setup_model(opt) + save_submission_filename = \ + "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, tasks=opt.tasks) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/mixture_embedding_experts/model.py b/baselines/mixture_embedding_experts/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ee00aa564f675813be1528045235f90115480d99 --- /dev/null +++ b/baselines/mixture_embedding_experts/model.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn +from baselines.mixture_embedding_experts.model_components import NetVLAD, MaxMarginRankingLoss, GatedEmbeddingUnit +from easydict import EasyDict as edict + +mee_base_cfg = edict( + ctx_mode="video", + text_input_size=768, + vid_input_size=1024, + output_size=256, + margin=0.2 +) + + +class MEE(nn.Module): + def __init__(self, config): + 
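+        # The query is pooled with a small NetVLAD (2 clusters). Each modality named in ctx_mode gets
+        # its own GatedEmbeddingUnit for the context and for the pooled query; when both video and
+        # subtitles are used, moe_fc predicts the query-dependent weights that mix the two streams.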
super(MEE, self).__init__() + self.config = config + self.use_video = "video" in config.ctx_mode + self.use_sub = "sub" in config.ctx_mode + + self.query_pooling = NetVLAD(feature_size=config.text_input_size, cluster_size=2) + + if self.use_sub: + self.sub_query_gu = GatedEmbeddingUnit(input_dimension=self.query_pooling.out_dim, + output_dimension=config.output_size) + self.sub_gu = GatedEmbeddingUnit(input_dimension=config.text_input_size, + output_dimension=config.output_size) + + if self.use_video: + self.video_query_gu = GatedEmbeddingUnit(input_dimension=self.query_pooling.out_dim, + output_dimension=config.output_size) + self.video_gu = GatedEmbeddingUnit(input_dimension=config.vid_input_size, + output_dimension=config.output_size) + + if self.use_video and self.use_sub: + self.moe_fc = nn.Linear(self.query_pooling.out_dim, 2) # weights + + self.max_margin_loss = MaxMarginRankingLoss(margin=config.margin) + + def forward(self, query_feat, query_mask, video_feat, sub_feat): + """ + Args: + query_feat: (N, L, D_q) + query_mask: (N, L) + video_feat: (N, Dv) + sub_feat: (N, Dt) + """ + pooled_query = self.query_pooling(query_feat) # (N, Dt) + encoded_video, encoded_sub = self.encode_context(video_feat, sub_feat) + confusion_matrix = self.get_score_from_pooled_query_with_encoded_ctx(pooled_query, encoded_video, encoded_sub) + return self.max_margin_loss(confusion_matrix) + + def encode_context(self, video_feat, sub_feat): + """(N, D)""" + encoded_video = self.video_gu(video_feat) if self.use_video else None + encoded_sub = self.sub_gu(sub_feat) if self.use_sub else None + return encoded_video, encoded_sub + + def compute_single_stream_scores_with_encoded_ctx(self, pooled_query, encoded_ctx, module_name="video"): + encoded_query = getattr(self, module_name+"_query_gu")(pooled_query) # (N, D) + return torch.einsum("md,nd->mn", encoded_query, encoded_ctx) # (N, N) + + def get_score_from_pooled_query_with_encoded_ctx(self, pooled_query, encoded_video, encoded_sub): + """Nq may not equal to Nc + Args: + pooled_query: (Nq, Dt) + encoded_video: (Nc, Dc) + encoded_sub: (Nc, Dc) + """ + + video_confusion_matrix = self.compute_single_stream_scores_with_encoded_ctx( + pooled_query, encoded_video, module_name="video") if self.use_video else 0 + sub_confusion_matrix = self.compute_single_stream_scores_with_encoded_ctx( + pooled_query, encoded_sub, module_name="sub") if self.use_sub else 0 + + if self.use_video and self.use_sub: + stream_weights = self.moe_fc(pooled_query) # (N, 2) + confusion_matrix = \ + stream_weights[:, 0:1] * video_confusion_matrix + stream_weights[:, 1:2] * sub_confusion_matrix + else: + confusion_matrix = video_confusion_matrix + sub_confusion_matrix + return confusion_matrix # (Nq, Nc) + diff --git a/baselines/mixture_embedding_experts/model_components.py b/baselines/mixture_embedding_experts/model_components.py new file mode 100644 index 0000000000000000000000000000000000000000..d5edefe523e8cf7274964009817337283d546bef --- /dev/null +++ b/baselines/mixture_embedding_experts/model_components.py @@ -0,0 +1,103 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class GatedEmbeddingUnit(nn.Module): + def __init__(self, input_dimension, output_dimension): + super(GatedEmbeddingUnit, self).__init__() + + self.fc = nn.Linear(input_dimension, output_dimension) + self.cg = ContextGating(output_dimension) + + def forward(self, x): + x = self.fc(x) + x = self.cg(x) + x = F.normalize(x) + return x + + +class ContextGating(nn.Module): + def 
__init__(self, dimension, add_batch_norm=True): + super(ContextGating, self).__init__() + self.fc = nn.Linear(dimension, dimension) + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(dimension) + + def forward(self, x): + x1 = self.fc(x) + + if self.add_batch_norm: + x1 = self.batch_norm(x1) + + x = torch.cat((x, x1), 1) + return F.glu(x, 1) + + +class MaxMarginRankingLoss(nn.Module): + def __init__(self, margin=1): + super(MaxMarginRankingLoss, self).__init__() + self.margin = margin + + def forward(self, x): + n = x.size()[0] + + x1 = torch.diag(x) + x1 = x1.unsqueeze(1) + x1 = x1.expand(n, n) + x1 = x1.contiguous().view(-1, 1) + x1 = torch.cat((x1, x1), 0) + + x2 = x.view(-1, 1) + x3 = x.transpose(0, 1).contiguous().view(-1, 1) + + x2 = torch.cat((x2, x3), 0) + + max_margin = F.relu(self.margin - (x1 - x2)) + return max_margin.mean() + + +class NetVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_batch_norm=True): + super(NetVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(feature_size, cluster_size)) + self.clusters2 = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(1, feature_size, cluster_size)) + + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(cluster_size) + self.out_dim = cluster_size * feature_size + + def forward(self, x): + max_sample = x.size()[1] + x = x.view(-1, self.feature_size) + assignment = torch.matmul(x, self.clusters) + + if self.add_batch_norm: + assignment = self.batch_norm(assignment) + + assignment = F.softmax(assignment, dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + a_sum = torch.sum(assignment, -2, keepdim=True) + a = a_sum * self.clusters2 + + assignment = assignment.transpose(1, 2) + + x = x.view(-1, max_sample, self.feature_size) + vlad = torch.matmul(assignment, x) + vlad = vlad.transpose(1, 2) + vlad = vlad - a + + # L2 intra norm + vlad = F.normalize(vlad) + + # flattening + L2 norm + vlad = vlad.view(-1, self.cluster_size * self.feature_size) + vlad = F.normalize(vlad) + + return vlad diff --git a/baselines/mixture_embedding_experts/retrieval_dataset.py b/baselines/mixture_embedding_experts/retrieval_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4e775f7d2aaa8594c1a9902070d23337ec760b74 --- /dev/null +++ b/baselines/mixture_embedding_experts/retrieval_dataset.py @@ -0,0 +1,283 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array, flat_list_of_lists, merge_dicts +from utils.tensor_utils import pad_sequences_1d + +logger = logging.getLogger(__name__) + + +class RetrievalDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + Return: + a dict: { + "meta": { + "query_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + } + "model_inputs": { + "query_feat": torch.tensor, (L, D_q) + "video_feat": torch.tensor, (n_clip_in_moment, D_video) + "sub_feat": torch.tensor, (n_clip_in_moment, D_sub) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, + vid_feat_path_or_handler, max_desc_len, max_ctx_len, ctx_mode="video", + normalize_vfeat=True, 
normalize_tfeat=True, h5driver=None, data_ratio=1.0): + self.dset_name = dset_name + self.data_path = data_path + self.data_ratio = data_ratio + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + + self.desc_bert_path_or_handler = desc_bert_path_or_handler + self.sub_bert_path_or_handler = sub_bert_path_or_handler + self.vid_feat_path_or_handler = vid_feat_path_or_handler + self.ctx_mode = ctx_mode + + # prepare desc data + self.data = load_jsonl(data_path) + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ) + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + + ctx_l = 0 + if self.use_video: + video_feat = np.mean(self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) # (D, ) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + else: + model_inputs["video_feat"] = torch.zeros(2) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = np.mean(self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros(2) + return dict(meta=meta, model_inputs=model_inputs) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + +class RetrievalEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. 
+ load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. + """ + def __init__(self, dset_name, eval_split_name, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, ctx_mode="video", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): + self.dset_name = dset_name + self.eval_split_name = eval_split_name + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + self.data_path = data_path + self.query_data = load_jsonl(data_path) + if data_ratio != 1: + n_examples = int(len(self.query_data) * data_ratio) + self.query_data = self.query_data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path)[self.eval_split_name] + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + def load_gt_vid_name_for_query(self, load_gt_video): + """load_gt_video: bool, affect the returned value of self._get_item_query""" + assert "vid_name" in self.query_data[0] + self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return len(self.video_data) + else: + return len(self.query_data) + + def __getitem__(self, index): + if self.data_mode == "context": + return self._get_item_context(index) + else: + return self._get_item_query(index) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.query_data[index] + + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + return dict(meta=meta, model_inputs=model_inputs) + + def _get_item_context(self, index): + """No need to batch, since it has already been batched here""" + raw_data = self.video_data[index] + + # initialize with 
basic data + meta = dict( + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ) + + model_inputs = dict() + + if self.use_video: + video_feat = np.mean(self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) # (1, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + else: + model_inputs["video_feat"] = torch.zeros(2) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = np.mean(self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros(2) + return dict(meta=meta, model_inputs=model_inputs) + + +def retrieval_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? + + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = dict() + for k in model_inputs_keys: + if k == "query_feat": + batched_data[k] = pad_sequences_1d( + [e["model_inputs"][k] for e in batch], dtype=torch.float32, fixed_length=None) + elif "feat" in k: + batched_data[k] = torch.stack([e["model_inputs"][k] for e in batch]) + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + if k == "query_feat": + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + else: + model_inputs[k] = v.to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.crossmodal_moment_localization.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/mixture_embedding_experts/scripts/inference.sh b/baselines/mixture_embedding_experts/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..fca00159bae71e178e30b3ff7040e97329670469 --- /dev/null +++ b/baselines/mixture_embedding_experts/scripts/inference.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/mixture_embedding_experts/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 # [val ] +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VR) +echo "tasks ${tasks[@]}" +python baselines/mixture_embedding_experts/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/mixture_embedding_experts/scripts/train.sh b/baselines/mixture_embedding_experts/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..53cd2504ee09f5f2f1a45f1dfa764b64416d2eb7 --- /dev/null +++ b/baselines/mixture_embedding_experts/scripts/train.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +dset_name=$1 # see case below +ctx_mode=$2 # ["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d, none] , none for subtitles only models +feature_root=data/tvr_feature_release +results_root=baselines/mixture_embedding_experts/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" 
]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + clip_length=1.5 + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." + desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +python baselines/mixture_embedding_experts/train.py \ +--dset_name=${dset_name} \ +--eval_split_name=${eval_split_name} \ +--results_root=${results_root} \ +--train_path=${train_path} \ +--desc_bert_path=${desc_bert_path} \ +--corpus_path=${corpus_path} \ +--vid_feat_path=${vid_feat_path} \ +--vid_feat_size=${vid_feat_size} \ +--ctx_mode=${ctx_mode} \ +${extra_args[@]} \ +${@:4} diff --git a/baselines/mixture_embedding_experts/train.py b/baselines/mixture_embedding_experts/train.py new file mode 100644 index 0000000000000000000000000000000000000000..303e59c4e458235df01f2c3ad9969ad3a88e5dc1 --- /dev/null +++ b/baselines/mixture_embedding_experts/train.py @@ -0,0 +1,280 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from collections import OrderedDict +from easydict import EasyDict as EDict +from tqdm import tqdm, trange + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from baselines.mixture_embedding_experts.config import BaseOptions +from baselines.mixture_embedding_experts.model import MEE +from baselines.mixture_embedding_experts.retrieval_dataset import \ + RetrievalDataset, retrieval_collate, RetrievalEvalDataset, prepare_batch_inputs +from baselines.mixture_embedding_experts.inference import eval_epoch, start_inference +from utils.basic_utils import save_jsonl, save_json, AverageMeter +from utils.model_utils import count_parameters + + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d 
%H:%M:%S", + level=logging.INFO) + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, train_loader, optimizer, opt, epoch_i): + model.train() + + # init meters + dataloading_time = AverageMeter() + prepare_inputs_time = AverageMeter() + model_forward_time = AverageMeter() + model_backward_time = AverageMeter() + loss_meter = AverageMeter() + + num_training_examples = len(train_loader) + timer_dataloading = time.time() + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + dataloading_time.update(time.time() - timer_dataloading) + + # continue + timer_start = time.time() + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + prepare_inputs_time.update(time.time() - timer_start) + timer_start = time.time() + loss = model(**model_inputs) + model_forward_time.update(time.time() - timer_start) + timer_start = time.time() + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + model_backward_time.update(time.time() - timer_start) + + global_step = epoch_i * num_training_examples + batch_idx + opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) + opt.writer.add_scalar("Train/Loss", float(loss), global_step) + loss_meter.update(float(loss)) + + timer_dataloading = time.time() + if opt.debug and batch_idx == 3: + break + to_write = opt.train_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + loss_str=str(loss_meter.avg)) + with open(opt.train_log_filepath, "a") as f: + f.write(to_write) + print("Epoch time stats:") + print("dataloading_time: max {dataloading_time.max} " + "min {dataloading_time.min} avg {dataloading_time.avg}\n" + "prepare_inputs_time: max {prepare_inputs_time.max} " + "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" + "model_forward_time: max {model_forward_time.max} " + "min {model_forward_time.min} avg {model_forward_time.avg}\n" + "model_backward_time: max {model_backward_time.max} " + "min {model_backward_time.min} avg {model_backward_time.avg}\n" + "".format(dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, + model_forward_time=model_forward_time, model_backward_time=model_backward_time)) + + +def train(model, train_dataset, val_dataset, opt): + # Prepare optimizer + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=opt.lr) + # reduce the lr by 0.1 every 30 epochs + scheduler = torch.optim.lr_scheduler.ExponentialLR( + optimizer, + gamma=0.95 + ) + + train_loader = DataLoader(train_dataset, + collate_fn=retrieval_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + prev_best_score = 0. 
+ es_cnt = 0 + start_epoch = -1 if opt.eval_untrained else 0 + eval_tasks_at_training = ["VR"] + save_submission_filename = \ + "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, "_".join(eval_tasks_at_training)) + for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): + if epoch_i > -1: + with torch.autograd.detect_anomaly(): + train_epoch(model, train_loader, optimizer, opt, epoch_i) + global_step = (epoch_i + 1) * len(train_loader) + scheduler.step() + if opt.eval_path is not None: + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, val_dataset, opt, save_submission_filename, tasks=eval_tasks_at_training) + logger.info("metrics_no_nms {}".format( + pprint.pformat(rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + to_write = opt.eval_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + eval_metrics_str=json.dumps(metrics_no_nms)) + with open(opt.eval_log_filepath, "a") as f: + f.write(to_write) + + # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms + metrics = metrics_no_nms + # early stop/ log / save model + for task_type, task_metrics in metrics.items(): + for iou_thd in [0.5, 0.7]: + opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), + {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, + global_step) + + # use the most strict metric available + if metrics["VR"]["r1"] > prev_best_score: + es_cnt = 0 + prev_best_score = metrics["VR"]["r1"] + + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] + for src, tgt in zip(latest_file_paths, best_file_paths): + os.renames(src, tgt) + logger.info("The checkpoint file has been updated.") + else: + es_cnt += 1 + if es_cnt > opt.max_es_cnt: # early stop + with open(opt.train_log_filepath, "a") as f: + f.write("Early Stop at epoch {}".format(epoch_i)) + logger.info("Early stop at {} with VR r1 {}".format(epoch_i, prev_best_score)) + break + else: + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + if opt.debug: + break + + opt.writer.close() + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def start_training(): + logger.info("Setup config, data and model...") + opt = BaseOptions().parse() + set_seed(opt.seed) + if opt.debug: # keep the model run deterministically + # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. + # Enable this only when input size is fixed. 
+ cudnn.benchmark = False + cudnn.deterministic = True + + opt.writer = SummaryWriter(opt.tensorboard_log_dir) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + train_dataset = RetrievalDataset( + dset_name=opt.dset_name, + data_path=opt.train_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + vid_feat_path_or_handler=opt.vid_feat_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + if opt.eval_path is not None: + eval_dataset = RetrievalEvalDataset( + dset_name=opt.dset_name, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=train_dataset.desc_bert_h5, + sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + else: + eval_dataset = None + + model_config = EDict( + ctx_mode=opt.ctx_mode, + text_input_size=opt.sub_feat_size, + vid_input_size=opt.vid_feat_size, # + output_size=opt.output_size, + margin=opt.margin, # margin for ranking loss + ) + logger.info("model_config {}".format(model_config)) + model = MEE(model_config) + count_parameters(model) + logger.info("Start Training...") + train(model, train_dataset, eval_dataset, opt) + return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug + + +if __name__ == '__main__': + model_dir, eval_split_name, eval_path, debug = start_training() + if not debug: + model_dir = model_dir.split(os.sep)[-1] + tasks = ["VR"] + input_args = ["--model_dir", model_dir, + "--eval_split_name", eval_split_name, + "--eval_path", eval_path, + "--tasks"] + tasks + + import sys + sys.argv[1:] = input_args + logger.info("\n\n\nFINISHED TRAINING!!!") + logger.info("Evaluating model in {}".format(model_dir)) + start_inference() diff --git a/baselines/profiling/README.md b/baselines/profiling/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6a15400bc8123c78a740e0d19f8bd81041395914 --- /dev/null +++ b/baselines/profiling/README.md @@ -0,0 +1,5 @@ +# Profiling + +### Additional Requirements: +- [FAISS](https://github.com/facebookresearch/faiss/) for nearest neighbor search, +install it by `pip install faiss-gpu==1.6.1`. diff --git a/baselines/profiling/profile_main.py b/baselines/profiling/profile_main.py new file mode 100644 index 0000000000000000000000000000000000000000..76577690697937632ce3d2513bc5e45ebfbe48a6 --- /dev/null +++ b/baselines/profiling/profile_main.py @@ -0,0 +1,485 @@ +""" +Profile the time needed for retrieval. +We consider retrieval in a corpus of 1M videos, 1K videos are added, 10K queries are retrieved. +Calculate the time needed for adding 1K videos, and performing retrieval for 10K queries. + +1, Data Loading time is ignored, consider it is hidden by computation time. +2, Sort time is ignored, since it is the similar among the methods. 
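+3, Reported numbers are per-batch averages; the summary saved at the bottom of this file extrapolates them to the full corpus, e.g. encoding all videos is estimated roughly as avg_batch_time * N_Videos / ctx_batch_size.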
+""" +import os +import time +import torch +import torch.nn as nn +import torch.nn.functional as F +import pprint +from tqdm import tqdm, trange +from baselines.crossmodal_moment_localization.model_xml import XML, xml_base_config +from baselines.mixture_embedding_experts.model import MEE, mee_base_cfg +from baselines.clip_alignment_with_language.model import CALWithSub, cal_base_cfg +from baselines.excl.model import EXCL, excl_base_cfg +from utils.basic_utils import save_json + + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def mask_logits(target, mask): + return target * mask + (1 - mask) * (-1e10) + + +class ProfileBase(object): + N_NewQuery = 1e4 + N_NewVideo = 1e3 + N_Videos = 1e6 + AvgVideoLength = 100 + ClipLength = 5 + AvgClipPerVideo = int(AvgVideoLength / ClipLength) # max_ctx_l + AvgWordInQuery = 15 + # estimated by + # scales=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], => max_proposal = 14 + AvgProposalPerVideo = 170 + MaxClipPerProposal = 14 # pad to this length + AvgClipPerProposal = 7 # 6.88 + + VideoFeatureDim = 3074 # 1024 + 2048 + 2 (TEF) + SubFeatureDim = 770 + QueryFeatureDim = 768 + + HiddenSize = 256 + N_Runs = 5 # Get the average time + + def __init__(self, device=torch.device("cuda:0"), ctx_batch_size=400, query_batch_size=100): + self.device = device + self.ctx_batch_size = ctx_batch_size + self.query_batch_size = query_batch_size + self.model_config = self.get_model_config() + print(self.model_config) + self.model = self.get_model() + + def get_model(self): + return None + + def get_model_config(self): + return None + + def set_ctx_batch_size(self, batch_size): + self.ctx_batch_size = batch_size + + def set_query_batch_size(self, batch_size): + self.query_batch_size = batch_size + + def cast_dict_inputs_to_device(self, dict_inputs, device): + return {k: v.to(device) for k, v in dict_inputs.items()} + + def get_fake_ctx_raw_input_st_ed(self, no_tef=False): + return dict( + video_feat=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l, + self.VideoFeatureDim - 2*no_tef), + sub_feat=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l, self.SubFeatureDim - 2*no_tef), + ctx_mask=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l), + ) + + def get_fake_raw_query(self): + return dict( + query_feat=torch.FloatTensor(self.query_batch_size, self.AvgWordInQuery, self.QueryFeatureDim), + query_mask=torch.ones(self.query_batch_size, self.AvgWordInQuery) + ) + + +""" +from baselines.profiling.profile_main import ProfileXML +profile_xml = ProfileXML(ctx_batch_size=400, query_batch_size=100) +profile_xml.get_ctx_encoding_time() +""" + + +class ProfileXML(ProfileBase): + def get_model_config(self): + xml_base_config["ctx_mode"] = "video_sub_tef" + xml_base_config["merge_two_stream"] = True + xml_base_config["cross_att"] = True + xml_base_config["max_ctx_l"] = self.AvgClipPerVideo + xml_base_config["visual_input_size"] = self.VideoFeatureDim + xml_base_config["query_input_size"] = self.QueryFeatureDim + xml_base_config["sub_input_size"] = self.SubFeatureDim + xml_base_config["hidden_size"] = self.HiddenSize + return xml_base_config + + def get_model(self): + model = XML(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_encoded_ctx(self): + return dict( + ctx_feat=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l, 
self.HiddenSize), + ctx_mask=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l), + ) + + def get_fake_encoded_query(self): + return dict(query_feat=torch.FloatTensor(self.ctx_batch_size, self.HiddenSize)) + + def _get_ctx_encoding_time(self, video_feat, sub_feat, ctx_mask): + """Considered two modalities""" + torch.cuda.synchronize() + st_time = time.time() + self.model.cross_encode_context(video_feat, ctx_mask, sub_feat, ctx_mask) + torch.cuda.synchronize() + return time.time() - st_time + + def get_ctx_encoding_time(self): + with torch.no_grad(): + fake_ctx_inputs = self.cast_dict_inputs_to_device(self.get_fake_ctx_raw_input_st_ed(), self.device) + raw_video = fake_ctx_inputs["video_feat"] + raw_sub = fake_ctx_inputs["sub_feat"] + ctx_mask = fake_ctx_inputs["ctx_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_ctx_encoding_time(raw_video, raw_sub, ctx_mask)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_query_encoding_time(self, raw_query, query_mask): + """Considered two modalities""" + torch.cuda.synchronize() + st_time = time.time() + encoded_query = self.model.encode_input(raw_query, query_mask, + self.model.query_input_proj, + self.model.query_encoder, + self.model.query_pos_embed) # (N, Lq, D) + # video level + video_query, sub_query = \ + self.model.get_modularized_queries(encoded_query, query_mask, return_modular_att=False) + # st ed + video_query = self.model.video_query_linear(video_query) + sub_query = self.model.sub_query_linear(sub_query) + torch.cuda.synchronize() + return time.time() - st_time + + def get_query_encoding_time(self): + with torch.no_grad(): + query_inputs = self.cast_dict_inputs_to_device(self.get_fake_raw_query(), self.device) + raw_query = query_inputs["query_feat"] + query_mask = query_inputs["query_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_query_encoding_time(raw_query, query_mask)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_retrieval_time(self, encoded_video_query, encoded_video, ctx_mask): + """Consider the queries are encoded, Calculate in a single modality then multiply by 2.""" + torch.cuda.synchronize() + st_time = time.time() + self.model.get_video_level_scores(encoded_video_query, encoded_video, ctx_mask) + torch.cuda.synchronize() + return (time.time() - st_time) * 2 + + def get_retrieval_time(self): + with torch.no_grad(): + encoded_query = self.cast_dict_inputs_to_device(self.get_fake_encoded_query(), self.device)["query_feat"] + fake_ctx_inputs = self.cast_dict_inputs_to_device(self.get_fake_encoded_ctx(), self.device) + encoded_ctx = fake_ctx_inputs["ctx_feat"] + ctx_mask = fake_ctx_inputs["ctx_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_retrieval_time(encoded_query, encoded_ctx, ctx_mask)] + times = torch.FloatTensor(times) # since we have two modalities + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_span_prediction_time(self, query_feat, ctx_feat, ctx_mask): + """Considered two modalities""" + torch.cuda.synchronize() + st_time = time.time() + similarity = torch.einsum("md,nld->mnl", query_feat, ctx_feat) + similarity = (similarity + similarity) / 2 # (Nq, Nv, L) from query to all videos. 
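+        # adding the map to itself (then halving) mimics fusing a second, subtitle-stream similarity map
+        # of the same size, so that fusion cost is included in the measured span-prediction time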
+ n_q, n_c, l = similarity.shape + similarity = similarity.view(n_q * n_c, 1, l) + st_prob = self.model.merged_st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + ed_prob = self.model.merged_ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + st_prob = mask_logits(st_prob, ctx_mask) # (N, L) + ed_prob = mask_logits(ed_prob, ctx_mask) + torch.cuda.synchronize() + return time.time() - st_time + + def get_span_prediction_time(self): + with torch.no_grad(): + encoded_query = self.cast_dict_inputs_to_device(self.get_fake_encoded_query(), self.device)["query_feat"] + fake_ctx_inputs = self.cast_dict_inputs_to_device(self.get_fake_encoded_ctx(), self.device) + encoded_ctx = fake_ctx_inputs["ctx_feat"] + ctx_mask = fake_ctx_inputs["ctx_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_span_prediction_time(encoded_query, encoded_ctx, ctx_mask)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +""" +from baselines.profiling.profile_main import ProfileMEE +profile_mee = ProfileMEE(ctx_batch_size=400, query_batch_size=100) +profile_mee.get_ctx_encoding_time() +""" + + +class ProfileMEE(ProfileBase): + def get_model_config(self): + mee_base_cfg["ctx_mode"] = "video_sub" + mee_base_cfg["text_input_size"] = self.QueryFeatureDim + mee_base_cfg["vid_input_size"] = self.VideoFeatureDim + mee_base_cfg["output_size"] = self.HiddenSize + return mee_base_cfg + + def get_model(self): + model = MEE(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_raw_ctx(self): + return dict( + vid_feat=torch.FloatTensor(self.ctx_batch_size, self.VideoFeatureDim), + sub_feat=torch.FloatTensor(self.ctx_batch_size, self.QueryFeatureDim) + ) + + def get_fake_encoded_ctx_query(self): + return dict( + ctx_feat=torch.FloatTensor(self.ctx_batch_size, self.HiddenSize), + query_feat=torch.FloatTensor(self.ctx_batch_size, self.HiddenSize) + ) + + def _get_ctx_encoding_time(self, vid_feat, sub_feat): + torch.cuda.synchronize() + st_time = time.time() + self.model.video_gu(vid_feat) + self.model.sub_gu(sub_feat) + torch.cuda.synchronize() + return time.time() - st_time + + def get_ctx_encoding_time(self): + feat_dict = self.cast_dict_inputs_to_device(self.get_fake_raw_ctx(), self.device) + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_ctx_encoding_time(**feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_query_encoding_time(self, query_feat): + """Considered 2 modalities""" + torch.cuda.synchronize() + st_time = time.time() + pooled_query = self.model.query_pooling(query_feat) # (N, Dt) + video_query = self.model.video_query_gu(pooled_query) + sub_query = self.model.sub_query_gu(pooled_query) + stream_weights = self.model.moe_fc(pooled_query) # (N, 2) + torch.cuda.synchronize() + return time.time() - st_time + + def get_query_encoding_time(self): + raw_query = self.cast_dict_inputs_to_device(self.get_fake_raw_query(), self.device)["query_feat"] + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_query_encoding_time(raw_query)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_retrieval_time(self, encoded_query, encoded_ctx): + """Considered 2 modalities""" + torch.cuda.synchronize() + st_time = time.time() + torch.einsum("md,nd->mn", encoded_query, encoded_ctx) # (N, N) + torch.cuda.synchronize() + 
return (time.time() - st_time) * 2 + + def get_retrieval_time(self): + model_inputs = self.cast_dict_inputs_to_device(self.get_fake_encoded_ctx_query(), self.device) + encoded_query = model_inputs["ctx_feat"] + encoded_ctx = model_inputs["query_feat"] + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_retrieval_time(encoded_query, encoded_ctx)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +class ProfileCAL(ProfileBase): + def get_model_config(self): + cal_base_cfg["ctx_mode"] = "video_sub" + cal_base_cfg["embedding_size"] = self.QueryFeatureDim + cal_base_cfg["visual_input_size"] = self.VideoFeatureDim * 2 + cal_base_cfg["textual_input_size"] = self.SubFeatureDim * 2 + cal_base_cfg["output_size"] = self.HiddenSize + return cal_base_cfg + + def get_model(self): + model = CALWithSub(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_raw_ctx(self, model_name="cal"): + """The features are `*2` since they use both global and local features""" + return dict( + sub_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgProposalPerVideo, + self.AvgClipPerProposal, self.SubFeatureDim * 2), + vid_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgProposalPerVideo, + self.AvgClipPerProposal, self.VideoFeatureDim * 2)) + + def _get_ctx_encoding_time(self, sub_feat, vid_feat, model_name="cal"): + if model_name == "mcn": + sub_feat = sub_feat.sum(2) + vid_feat = vid_feat.sum(2) + torch.cuda.synchronize() + st_time = time.time() + self.model.moment_encoder(vid_feat, module_name="video") + self.model.moment_encoder(sub_feat, module_name="sub") + torch.cuda.synchronize() + return time.time() - st_time + + def get_ctx_encoding_time(self, model_name="cal"): + """model_name: str, `cal` or `mcn`""" + feat_dict = self.cast_dict_inputs_to_device( + self.get_fake_raw_ctx(model_name=model_name), self.device) + feat_dict["model_name"] = model_name + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_ctx_encoding_time(**feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_query_encoding_time(self, query_feat, query_mask): + torch.cuda.synchronize() + st_time = time.time() + self.model.query_encoder(query_feat, query_mask) + torch.cuda.synchronize() + return time.time() - st_time + + def get_query_encoding_time(self): + feat_dict = self.cast_dict_inputs_to_device(self.get_fake_raw_query(), self.device) + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_query_encoding_time(**feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +class ProfileExCL(ProfileBase): + def get_model_config(self): + excl_base_cfg["ctx_mode"] = "video_sub" + excl_base_cfg["query_input_size"] = self.QueryFeatureDim + excl_base_cfg["visual_input_size"] = self.VideoFeatureDim + excl_base_cfg["sub_input_size"] = self.SubFeatureDim + excl_base_cfg["output_size"] = self.HiddenSize + return excl_base_cfg + + def get_model(self): + model = EXCL(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_raw_input(self): + """The features are `*2` since they use both global and local features""" + return dict( + query_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgWordInQuery, self.QueryFeatureDim), + query_mask=torch.ones((self.ctx_batch_size, self.AvgWordInQuery)), + 
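# the entries below mirror ExCL's remaining per-clip inputs: subtitle, video and temporal endpoint (tef) features plus their masks +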
sub_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgClipPerVideo, self.SubFeatureDim), + sub_mask=torch.ones(self.ctx_batch_size, self.AvgClipPerVideo), + video_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgClipPerVideo, self.VideoFeatureDim), + video_mask=torch.ones(self.ctx_batch_size, self.AvgClipPerVideo), + tef_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgClipPerVideo, 2), + tef_mask=torch.ones(self.ctx_batch_size, self.AvgClipPerVideo), + st_ed_indices=torch.ones(2, 2), # not used. + ) + + def _get_prediction_time(self, input_dict): + torch.cuda.synchronize() + st_time = time.time() + self.model(**input_dict) + torch.cuda.synchronize() + return time.time() - st_time + + def get_prediction_time(self): + """model_name: str, `cal` or `mcn`""" + feat_dict = self.cast_dict_inputs_to_device( + self.get_fake_raw_input(), self.device) + feat_dict["is_training"] = False + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_prediction_time(feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="") + parser.add_argument("--ctx_batch_size", type=int, default=400) + parser.add_argument("--query_batch_size", type=int, default=100) + parser.add_argument("--save_dir", type=str, default="baselines/profiling/cache") + args = parser.parse_args() + + model = args.model + query_batch_size = args.query_batch_size + ctx_batch_size = args.ctx_batch_size + if model == "mee": + profile_mee = ProfileMEE(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_mee.get_ctx_encoding_time() + ctx_enc_time = profile_mee.get_ctx_encoding_time() + query_enc_time = profile_mee.get_query_encoding_time() + elif model == "cal": + profile_cal = ProfileCAL(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_cal.get_ctx_encoding_time() + ctx_enc_time = profile_cal.get_ctx_encoding_time(model_name="cal") + query_enc_time = profile_cal.get_query_encoding_time() + elif model == "mcn": + profile_cal = ProfileCAL(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_cal.get_ctx_encoding_time() + ctx_enc_time = profile_cal.get_ctx_encoding_time(model_name="mcn") + query_enc_time = profile_cal.get_query_encoding_time() + elif model == "xml": + profile_xml = ProfileXML(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_xml.get_ctx_encoding_time() + ctx_enc_time = profile_xml.get_ctx_encoding_time() + query_enc_time = profile_xml.get_query_encoding_time() + elif model == "excl": + profile_excl = ProfileExCL(ctx_batch_size=ctx_batch_size, query_batch_size=ctx_batch_size) + # use the 2nd one to report time + profile_excl.get_prediction_time() + ctx_enc_time = profile_excl.get_prediction_time() + query_enc_time = 0 + # Calculate the total time as ctx_enc_time * (100 * 1M / ctx_batch_size) + else: + raise NotImplementedError + # ctx_enc_time = ctx_enc_time + save_path = os.path.join(args.save_dir, "{}_profile_main.json".format(model)) + + n_videos = ProfileBase.N_Videos + res = dict( + ctx_enc_time=ctx_enc_time, + ctx_enc_avg_time_all_videos=ctx_enc_time["avg"] * n_videos / ctx_batch_size, + query_enc_time=query_enc_time, + n_videos=n_videos, + 
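# ctx_enc_avg_time_all_videos above extrapolates the measured per-batch encoding time to the full corpus of n_videos +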
ctx_batch_size=ctx_batch_size, + query_batch_size=query_batch_size, + model=model + ) + save_json(res, save_path, save_pretty=True) + pprint.pprint(res) diff --git a/baselines/profiling/profile_main.sh b/baselines/profiling/profile_main.sh new file mode 100644 index 0000000000000000000000000000000000000000..bf9235dfc1f7dd6a64079809ed250d832f7c75cf --- /dev/null +++ b/baselines/profiling/profile_main.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +model=$1 +ctx_batch_size=$2 +save_dir=baselines/profiling/cache + +python baselines/profiling/profile_main.py \ +--model ${model} \ +--ctx_batch_size ${ctx_batch_size} \ +--query_batch_size 100 \ +--save_dir ${save_dir} + diff --git a/baselines/profiling/search_time_performance.py b/baselines/profiling/search_time_performance.py new file mode 100644 index 0000000000000000000000000000000000000000..ed4b322e1098822d0c81014248469637f069e290 --- /dev/null +++ b/baselines/profiling/search_time_performance.py @@ -0,0 +1,318 @@ +""" +Compute search time needed for searching 100 new queries in a corpus containing 1M videos. +The performance reported is tested on 1.4.0.dev20191109 with Python3.7 and CUDA10.1. + +This experiment is simulated. +""" + +import os +import time +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from utils.basic_utils import save_json + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + +np.random.seed(1234) + + +def compare_l2dist_inner_product_time(n_videos=2000, d=256, n_query=1000, n_runs=10, n_warmup_runs=10): + """In some PyTorch/Cuda Verison, torch.cdist is very slow, which affects this comparison. + See https://discuss.pytorch.org/t/cdist-vs-matmul/61682/5""" + torch.cuda.synchronize() + st_time = time.time() + fake_database = F.normalize(torch.randn((n_videos, d), dtype=torch.float32).cuda(), dim=1, p=2) + fake_query = F.normalize(torch.randn((n_query, d), dtype=torch.float32).cuda(), dim=1, p=2) + torch.cuda.synchronize() + print("Construct fake database + query time {}".format(time.time() - st_time)) + print("fake_database shape {} fake_query shape {}".format(fake_database.shape, fake_query.shape)) + + times_l2dist = [] + for _ in range(n_warmup_runs + n_runs): + torch.cuda.synchronize() + st_time = time.time() + l2_dist = torch.cdist(fake_query, fake_database, p=2) # (n_query, n_videos) + torch.cuda.synchronize() + times_l2dist.append(time.time() - st_time) + avg_time_l2dist = np.mean(times_l2dist[n_warmup_runs:]) + print("L2 Distance time {}".format(avg_time_l2dist)) + + times_ip = [] + fake_database = fake_database.transpose(0, 1) + for _ in range(n_warmup_runs + n_runs): + torch.cuda.synchronize() + st_time = time.time() + inner_product = torch.mm(fake_query, fake_database) # (n_query, n_videos) + torch.cuda.synchronize() + times_ip.append(time.time() - st_time) + avg_time_ip = np.mean(times_ip[n_warmup_runs:]) + print("Inner Product time {}".format(avg_time_ip)) + + +def run_example(): + """ + In Python, the matrices are always represented as numpy arrays. + The data type dtype must be float32. + """ + # -------------------------------- + # Step 1: Get Data + # -------------------------------- + import faiss + d = 64 # dimension + nb = 100000 # database size + nq = 10000 # nb of queries + np.random.seed(1234) # make reproducible + xb = np.random.random((nb, d)).astype('float32') + xb[:, 0] += np.arange(nb) / 1000. 
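+ # as in the faiss tutorial, the index-dependent offset adds structure along the first dimension so nearest-neighbor results are not purely random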
+ xq = np.random.random((nq, d)).astype('float32') + xq[:, 0] += np.arange(nq) / 1000. + + # -------------------------------- + # Step 2: Build `Index' object + # Note some of the indexes require a training phase to analyze the data distribution. + # -------------------------------- + index = faiss.IndexFlatL2(d) # build the index + print(index.is_trained) + index.add(xb) # add vectors to the index + print(index.ntotal) + + k = 4 # we want to see 4 nearest neighbors + D, I = index.search(xb[:5], k) # sanity check + print(I) + print(D) + st_time = time.time() + D, I = index.search(xq, k) # actual search + print("time elapsed {}".format(time.time() - st_time)) + print(I[:5]) # neighbors of the 5 first queries + print(I[-5:]) # neighbors of the 5 last queries + + +def simulate_mee_runtime(n_videos=1000000, d=256, n_query=100, max_neighbors=100, n_runs=5, n_warmup_runs=10): + """ Search over a database of shape [n_videos, d] with query of shape [n_query, d]. + For each query, return max_neighbors results. + """ + import faiss + torch.cuda.synchronize() + st_time = time.time() + fake_database = faiss.rand((n_videos, d)) + fake_query = faiss.rand((n_query, d)) + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + torch.cuda.synchronize() + st_time = time.time() + index = faiss.index_factory(d, "IVF4096,Flat", faiss.METRIC_L2) + index_ivf = faiss.extract_index_ivf(index) + clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) + index_ivf.clustering_index = clustering_index + torch.cuda.synchronize() + logger.info("Build/Move to GPU? index time {}".format(time.time() - st_time)) + + st_time = time.time() + torch.cuda.synchronize() + index_ivf.train(fake_database) + torch.cuda.synchronize() + logger.info("Train index time {}".format(time.time() - st_time)) + + times = [] + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() + D, I = index_ivf.search(fake_query, max_neighbors) + torch.cuda.synchronize() + times.append(time.time() - st_time) + avg_time = np.mean(times[n_warmup_runs:]) * 2 # video + sub + logger.info("Avg searching time ({} runs) {}".format(n_runs, avg_time)) + return avg_time + + +def simulate_cal_rerank_time(n_moments=200, avg_n_clips_per_moment=7, d=256, n_query=100, max_neighbors=100, + n_runs=5, n_warmup_runs=10): + st_time = time.time() + torch.cuda.synchronize() + fake_database = torch.randn((n_moments * avg_n_clips_per_moment, d), dtype=torch.float32).cuda() + fake_query = torch.randn((n_query, d), dtype=torch.float32).cuda() + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + times = [] + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() + fake_dist = torch.cdist(fake_query, fake_database, p=2) + fake_dist = fake_dist.view(n_query, n_moments, avg_n_clips_per_moment).mean(2) + fake_dist = torch.cdist(fake_query, fake_database, p=2) + fake_dist = fake_dist.view(n_query, n_moments, avg_n_clips_per_moment).mean(2) # video + sub + fake_dist = fake_dist + fake_dist + fake_top_indices, fake_top_dist = torch.topk(fake_dist, k=max_neighbors, dim=1, largest=False, sorted=True) + torch.cuda.synchronize() + times.append(time.time() - st_time) + avg_time = np.mean(times[n_warmup_runs:]) + logger.info("searching time {}".format(avg_time)) + return avg_time + + +def simulate_mcn_rerank_time(n_moments=200, d=256, n_query=100, max_neighbors=100, n_runs=5, n_warmup_runs=10): + 
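"""Like simulate_cal_rerank_time above, but each MCN moment is a single pooled vector, so no per-clip averaging is needed.""" +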
torch.cuda.synchronize() + st_time = time.time() + fake_database = torch.randn((n_moments, d), dtype=torch.float32).cuda() + fake_query = torch.randn((n_query, d), dtype=torch.float32).cuda() + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + times = [] + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() + fake_dist = torch.cdist(fake_query, fake_database, p=2).view(n_query, n_moments) + fake_dist = torch.cdist(fake_query, fake_database, p=2).view(n_query, n_moments) # video + sub + fake_dist = fake_dist + fake_dist + fake_top_indices, fake_top_dist = torch.topk(fake_dist, k=max_neighbors, dim=1, largest=False, sorted=True) + torch.cuda.synchronize() + times.append(time.time() - st_time) + avg_time = np.mean(times[n_warmup_runs:]) # + logger.info("searching time {}".format(avg_time)) + return avg_time + + +def simulate_xml_rerank_time(n_videos=100, avg_n_clips_per_video=20, d=256, n_query=100, max_neighbors=100, + n_runs=5, n_warmup_runs=10): + torch.cuda.synchronize() + st_time = time.time() + fake_database = torch.randn((d, n_videos*avg_n_clips_per_video), dtype=torch.float32).cuda() + fake_query = torch.randn((n_query, d), dtype=torch.float32).cuda() + conv = nn.Conv1d(in_channels=1, out_channels=2, kernel_size=5, stride=1, padding=2, bias=False).cuda() + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + times = dict( + conv=[], + prod=[], + topk=[], + triu=[] + ) + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() # [100, 256] [100, 20, 256] + fake_dist = torch.mm(fake_query, fake_database).view(n_query*n_videos, -1) + fake_dist = torch.mm(fake_query, fake_database).view(n_query * n_videos, -1) # video + sub + fake_dist = fake_dist + fake_dist + torch.cuda.synchronize() + times["prod"].append(time.time() - st_time) + torch.cuda.synchronize() + st_time = time.time() + fake_dist = conv(fake_dist.unsqueeze(1))[:, 0, :] + torch.cuda.synchronize() + times["conv"].append(time.time() - st_time) + torch.cuda.synchronize() + st_time = time.time() + fake_prob_prod = torch.triu(torch.einsum("ns,ne->nse", fake_dist, fake_dist)).view(n_query, -1) + torch.cuda.synchronize() + times["triu"].append(time.time() - st_time) + torch.cuda.synchronize() + st_time = time.time() + fake_top_indices, fake_top_dist = torch.topk(fake_prob_prod, k=max_neighbors, dim=1, largest=True, sorted=True) + torch.cuda.synchronize() + times["topk"].append(time.time() - st_time) + avg_time = {k: np.mean(times[k][n_warmup_runs:]) for k in times} + avg_time["all"] = np.sum(list(avg_time.values())) + logger.info("searching time {}".format(avg_time)) + return avg_time + + +def get_storage_size(hsz, n_videos, n_clips_per_video, n_moments, n_total_clips_in_moments, dtype_size=4): + """dtype_size: float32, 4B""" + GB = 1024**3 + # multiply by 2 for video+sub, xml has two level, so it has an additional 2 to multiply by. + storage = dict( + mee=n_videos * hsz * dtype_size * 2. / GB, + cal=n_total_clips_in_moments * hsz * dtype_size * 2. / GB, + mcn=n_moments * hsz * dtype_size * 2. / GB, + xml=n_videos * n_clips_per_video * hsz * dtype_size * 2. * 2. 
/ GB + ) + print("storage (GB) {}".format(storage)) + return storage + + +def main_run(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--mode", type=str, default="mee", help="which models to simulate") + parser.add_argument("--cache_dir", type=str, default="baselines/profiling/cache", help="save index/results path") + parser.add_argument("--n_runs", type=int, default=100, help="number of runs to calc average") + parser.add_argument("--n_warmup_runs", type=int, default=10, help="number of warmup runs, to init cuda, etc.") + args = parser.parse_args() + + """ + The numbers are get from the first author of + `Temporal Localization of Moments in Video Collections with Natural Language` + """ + k = 100 + n_query = 100 + n_videos = 1000000 + n_moments_per_video = 170 + hsz = 256 + n_clips_per_video = 20 + n_total_clips_in_moments = 1170946944 + n_moments = 170000000 + max_clips_per_proposal = 14 # assume padding to this number + avg_clips_per_proposal = 7 # 6.88 + + mode = args.mode + cfg_path = os.path.join(args.cache_dir, "{}_args.json".format(mode)) + + n_runs = args.n_runs + n_warmup_runs = args.n_warmup_runs + torch.set_grad_enabled(False) + if mode in ["mee", "mee_torch"]: + func_args = dict(n_videos=n_videos, d=hsz, n_query=n_query, max_neighbors=k, + n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_mee_runtime(**func_args) + elif mode == "xml_vr": + func_args = dict(n_videos=n_videos*n_clips_per_video, d=hsz, n_query=n_query, + max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_mee_runtime(**func_args) + elif mode == "cal": + # can only use n_query <= 4000, so use 4000. To get 20000, simply x5 the final time. + n_cal_rerank_videos = 100 + func_args = dict(n_moments=n_cal_rerank_videos*n_moments_per_video, + avg_n_clips_per_moment=avg_clips_per_proposal, + d=hsz, n_query=n_query, max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_cal_rerank_time(**func_args) + elif mode == "mcn": + n_cal_rerank_videos = 100 + func_args = dict(n_moments=n_cal_rerank_videos*n_moments_per_video, d=hsz, n_query=n_query, + max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_mcn_rerank_time(**func_args) + elif mode == "xml": + n_xml_videos = 100 + func_args = dict(n_videos=n_xml_videos, avg_n_clips_per_video=n_clips_per_video, + d=hsz, n_query=n_query, max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_xml_rerank_time(**func_args) + elif mode == "storage": + func_args = dict(hsz=hsz, n_videos=n_videos, n_clips_per_video=n_clips_per_video, + n_moments=n_moments, n_total_clips_in_moments=n_total_clips_in_moments, dtype_size=4) + storage = get_storage_size(**func_args) + else: + raise NotImplementedError + + if mode == "storage": + func_args["storage"] = storage + else: + func_args["n_runs"] = args.n_runs + func_args["avg_time"] = avg_time + func_args["mode"] = mode + print(func_args) + save_json(func_args, cfg_path, save_pretty=True) + + +if __name__ == '__main__': + main_run() + # compare_l2dist_inner_product_time() diff --git a/baselines/profiling/search_time_performance.sh b/baselines/profiling/search_time_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..4a33510bee6aa13f01d081123aec7892e50334a3 --- /dev/null +++ b/baselines/profiling/search_time_performance.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +mode=$1 +#dt=$(date '%Y_%m_%d_%H_%M_%S'); +#echo "$dt" +python 
baselines/profiling/search_time_performance.py \ +--mode ${mode} \ +--cache_dir baselines/profiling/cache + +#| tee baselines/profiling/cache/${mode}_${dt}.log \ No newline at end of file diff --git a/standalone_eval/__init__.py b/standalone_eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/standalone_eval/eval.py b/standalone_eval/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..164a18e0aa40e7973cade49f4f4dd24fcf497678 --- /dev/null +++ b/standalone_eval/eval.py @@ -0,0 +1,300 @@ +""" +Load prediction file and GT file to calculate TVR metrics: +- recall at top K (R@K), for a specified IoU, where K in [1, 5, 10, 100], IoU in [0.5, 0.7] +""" +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict, defaultdict + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def pad_sequences_1d_np(sequences, dtype=np.float32): + + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max(lengths)) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max(lengths)), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask + + +def compute_temporal_iou_batch(preds, gt): + """ compute intersection-over-union along temporal axis + This function is significantly faster than `compute_temporal_iou`, + the result should be the same. 
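+ It vectorizes the IoU computation of N predicted segments against a single ground-truth segment.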
+ Args: + preds: np.ndarray, (N, 2), [st (float), ed (float)] * N + gt: [st (float), ed (float)] + Returns: + iou: np.ndarray, (N, ) + + References: + for np.divide with zeros, see https://stackoverflow.com/a/37977222 + """ + intersection = np.maximum(0, np.minimum(preds[:, 1], gt[1]) - np.maximum(preds[:, 0], gt[0])) + union = np.maximum(preds[:, 1], gt[1]) - np.minimum(preds[:, 0], gt[0]) # the enclosing span (min start to max end), not the exact union + return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0) + + +def get_rounded_percentage(float_number, n_floats=2): + return round(float_number * 100, n_floats) + + +TASK_TYPES = OrderedDict([ + ("VCMR", "Video Corpus Moment Retrieval"), + ("SVMR", "Single Video Moment Retrieval"), + ("VR", "regular Video Retrieval") +]) + + +def eval_by_task_type(moment_predictions, video2idx, ground_truth, + iou_thds=(0.5, 0.7), recall_topks=(1, 5, 10, 100), + task_type="SVMR", max_pred_per_query=100, match_number=True, verbose=True, use_desc_type=True): + """ A predicted triplet is positive only if: + 1) its vid_name matches the GT vid_name + 2) IoU between its timestamp and GT timestamp is higher than the given threshold + + moment_predictions w.r.t. different task_type: + For each query, only the top max_pred_per_query [vid_name, st, ed] triplets are evaluated (the score entry is ignored). + VCMR: vid_name may repeat across predictions. + SVMR: vid_name is fixed to be the GT vid_name. + VR: vid_name does not repeat; st and ed are not used. + + Args: + video2idx: {vid_name (str): index (int), ...} + moment_predictions: list(dict), each dict is { + "desc": str, + "query_id": int, + "predictions": [vid_name_idx (int), st (float), ed (float), score (float)] * n_pred, + sorted predictions, n_pred could be different across dicts. For each prediction, + only the first 3 elements [vid_name_idx (int), st (float), ed (float)] are used, + any following elements are ignored; the score is kept only for the record. + } + ground_truth: list(dict), each dict is { + "desc": str, + "query_id": int, + "type": str, one of [v, t, vt] + "vid_name": str + "ts": [st (float), ed (float)], or a list of such pairs (len >= 4, e.g., for DiDeMo). + ... + } + iou_thds: temporal IoU thresholds + recall_topks: recall at different top k + task_type: str, one of ["VCMR", "SVMR", "VR"], see TASK_TYPES for definitions. + max_pred_per_query: int, only the top max_pred_per_query predictions for each query are used. + match_number: bool, must be set to True for real evaluation; False is only used for debugging.
+ verbose: + use_desc_type: only TVR has desc type + Returns: + + """ + assert task_type in TASK_TYPES, "task_type must be one of {}".format(list(TASK_TYPES.keys())) + if verbose: + print("Running evaluation with task_type {}, n results {}; n gt {}" + .format(task_type, len(moment_predictions), len(ground_truth))) + + predictions_by_query_id = {e["query_id"]: e for e in moment_predictions} + gt_by_query_id = {e["query_id"]: e for e in ground_truth} + desc_type2idx = {"v": 0, "t": 1, "vt": 2} + desc_types = [] # n_desc + + if match_number: + assert set(gt_by_query_id.keys()) == set(predictions_by_query_id.keys()), \ + "query_ids in predictions and ground_truth must match" + # assert len(set([len(e["predictions"]) for e in predictions_by_query_id.values()])) == 1, \ + # "all queries must have the same number of predictions" + + pred_info_matrix_collection = [] + for k, gt_item in tqdm(gt_by_query_id.items(), desc="Loop over moments", leave=False): + if not match_number and k not in predictions_by_query_id: + continue + pred_info_matrix = np.array( + [e[:3] for e in predictions_by_query_id[k]["predictions"]][:max_pred_per_query], + dtype=np.float32) # (n_pred, 3) + if use_desc_type: + desc_types.append(desc_type2idx[gt_item["type"]]) + vid_name_matched_pred = pred_info_matrix[:, 0] == video2idx[gt_item["vid_name"]] # bool, (n_pred, ) + pred_info_matrix = np.concatenate([pred_info_matrix, vid_name_matched_pred[:, None]], axis=1) # (n_pred, 4) + + # add 1 + len(iou_thds) columns, iou_scores, iou_corrects for each iou_thd. + iou_thd_corrects_columns = [] + if len(gt_item["ts"]) >= 4: # didemo, fro all 3 splits, at least 4 ts for each, < 0.5% has more than 4. + least_n_overlap = 2 # True if overlapped with at least least_n_overlap GT ts. + iou_corrects_dict = defaultdict(list) + for single_gt_ts in gt_item["ts"]: + single_gt_ts = np.array(single_gt_ts, dtype=np.float32) # (2, ) + # iou scores of the predictions that have wrong vid_name are set to 0. + iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred + for iou_thd in iou_thds: + iou_corrects_dict[iou_thd].append(iou_scores >= iou_thd) + for iou_thd in iou_thds: + iou_corrects = sum(iou_corrects_dict[iou_thd]) >= least_n_overlap # bool, (n_pred, ) + iou_thd_corrects_columns.append(iou_corrects[:, None]) + + else: # should be 2, len([st, ed]) == 2 + single_gt_ts = np.array(gt_item["ts"], dtype=np.float32) # (2, ) + # iou scores of the predictions that have wrong vid_name are set to 0. 
+ iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred + + for iou_thd in iou_thds: + iou_corrects = iou_scores >= iou_thd # bool, (n_pred, ) + iou_thd_corrects_columns.append(iou_corrects[:, None]) + + pred_info_matrix = np.concatenate([pred_info_matrix, ] + iou_thd_corrects_columns, axis=1) # (n_pred, 6) + pred_info_matrix_collection.append(pred_info_matrix) + + # column header [vid_name_idx (int), st (float), ed (float), is_vid_name_match (bool), + # iou_scores>=iou_thd0 (bool), iou_scores>=iou_thd1 (bool)] + pred_info_matrix_collection = pad_sequences_1d_np(pred_info_matrix_collection)[0] # (n_desc, n_pred, 6) + if use_desc_type: + desc_types = np.array(desc_types) # (n_desc) + + # results wrapper + metrics = OrderedDict() + metrics_by_type = OrderedDict() + + iou_c_offset = 4 # iou_corrects column index starts here + if task_type == "VCMR": + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = \ + get_rounded_percentage(np.mean(np.sum(iou_corrects[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(iou_corrects[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type + ) + elif task_type == "SVMR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + n_desc = len(vid_name_matched) + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = get_rounded_percentage(np.mean( + [np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 for idx in range(n_desc)] + )) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum([np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 and type_corrects[idx] + for idx in range(n_desc)]) + / n_desc_in_type) + + elif task_type == "VR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + for k in recall_topks: + metrics["r{}".format(k)] = \ + get_rounded_percentage(np.mean(np.sum(vid_name_matched[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for k in recall_topks: + metrics_by_type["{}-r{}".format(desc_type, k)] = 
get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(vid_name_matched[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type) + else: + raise ValueError("task_type wrong.") + if use_desc_type: + metrics_by_type["desc_type_ratio"] = "v {} t {} vt {}"\ + .format(*[get_rounded_percentage(1.0 * np.sum(desc_types == desc_type2idx[k]) / len(desc_types)) + for k in ["v", "t", "vt"]]) + return metrics, metrics_by_type + + +def eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True, match_number=True, use_desc_type=True): + video2idx = submission["video2idx"] + submitted_task_types = [k for k in TASK_TYPES if k in submission] + if verbose: + print("Evaluating for task {}".format(submitted_task_types)) + eval_metrics = OrderedDict() + metrics_raw_dict = {} + for task_type in submitted_task_types: + metrics, metrics_by_type = eval_by_task_type( + submission[task_type], video2idx, ground_truth, + iou_thds=iou_thds, recall_topks=(1, 5, 10, 100), + task_type=task_type, max_pred_per_query=100, + match_number=match_number, verbose=verbose, use_desc_type=use_desc_type) + metrics_raw_dict[task_type] = metrics + metrics_raw_dict[task_type+"_by_type"] = metrics_by_type + + for task_type in submitted_task_types: + eval_metrics[task_type] = metrics_raw_dict[task_type] + if use_desc_type: + for task_type in submitted_task_types: + eval_metrics[task_type+"_by_type"] = metrics_raw_dict[task_type+"_by_type"] + return eval_metrics + + +def eval_main(): + import argparse + parser = argparse.ArgumentParser(description="TVR Evaluation Script") + parser.add_argument("--submission_path", type=str, help="path to generated prediction file") + parser.add_argument("--gt_path", type=str, help="path to GT file") + parser.add_argument("--save_path", type=str, help="path to save the results") + parser.add_argument("--not_verbose", action="store_true") + args = parser.parse_args() + + verbose = not args.not_verbose + submission = load_json(args.submission_path) + gt = load_jsonl(args.gt_path) + results = eval_retrieval(submission, gt, iou_thds=(0.5, 0.7), verbose=verbose) + if verbose: + print(json.dumps(results, indent=4)) + + with open(args.save_path, "w") as f: + f.write(json.dumps(results, indent=4)) + + +if __name__ == '__main__': + eval_main() diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/basic_utils.py b/utils/basic_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1b319182fb7136058290bc3a9e2309fbe01c0db6 --- /dev/null +++ b/utils/basic_utils.py @@ -0,0 +1,206 @@ +import os +import json +import zipfile +import numpy as np +import pickle + + +def load_pickle(filename): + with open(filename, "rb") as f: + return pickle.load(f) + + +def save_pickle(data, filename): + with open(filename, "wb") as f: + pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def save_json(data, filename, save_pretty=False, sort_keys=False): + with open(filename, "w") as f: + if save_pretty: + f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) + else: + json.dump(data, f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def save_jsonl(data, filename): + """data is a list""" + with open(filename, "w") as f: + f.write("\n".join([json.dumps(e) for e in data])) + + +def 
save_lines(list_of_str, filepath): + with open(filepath, "w") as f: + f.write("\n".join(list_of_str)) + + +def read_lines(filepath): + with open(filepath, "r") as f: + return [e.strip("\n") for e in f.readlines()] + + +def mkdirp(p): + if not os.path.exists(p): + os.makedirs(p) + + +def flat_list_of_lists(l): + """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]""" + return [item for sublist in l for item in sublist] + + +def convert_to_seconds(hms_time): + """ convert '00:01:12' to 72 seconds. + :hms_time (str): time as a colon-separated string, e.g. '00:01:12' + :return (float): time in seconds, e.g. 72 + """ + times = [float(t) for t in hms_time.split(":")] + return times[0] * 3600 + times[1] * 60 + times[2] + + +def get_video_name_from_url(url): + return url.split("/")[-1][:-4] + + +def merge_dicts(list_dicts): + merged_dict = list_dicts[0].copy() + for i in range(1, len(list_dicts)): + merged_dict.update(list_dicts[i]) + return merged_dict + + +def l2_normalize_np_array(np_array, eps=1e-5): + """np_array: np.ndarray, (*, D), where the last dim will be normalized""" + return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps) + + +def make_zipfile(src_dir, save_path, enclosing_dir="", exclude_dirs=None, exclude_extensions=None, + exclude_dirs_substring=None): + """Make a zip file of src_dir and save it to save_path. + Directories in exclude_dirs are skipped if they are subdirectories of src_dir. + An enclosing_dir is added if specified. + """ + abs_src = os.path.abspath(src_dir) + with zipfile.ZipFile(save_path, "w") as zf: + for dirname, subdirs, files in os.walk(src_dir): + if exclude_dirs is not None: + for e_p in exclude_dirs: + if e_p in subdirs: + subdirs.remove(e_p) + if exclude_dirs_substring is not None: + to_rm = [] + for d in subdirs: + if exclude_dirs_substring in d: + to_rm.append(d) + for e in to_rm: + subdirs.remove(e) + arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:]) + zf.write(dirname, arcname) + for filename in files: + if exclude_extensions is not None: + if os.path.splitext(filename)[1] in exclude_extensions: + continue # do not zip it + absname = os.path.join(dirname, filename) + arcname = os.path.join(enclosing_dir, absname[len(abs_src) + 1:]) + zf.write(absname, arcname) + + +class AverageMeter(object): + """Computes and stores the average and current/max/min value""" + def __init__(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + + def update(self, val, n=1): + self.max = max(val, self.max) + self.min = min(val, self.min) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True): + """Dissect an array (N, D) into a list of sub-arrays; np_array.shape[0] == sum(lengths). + The output is a list of nd arrays with the singleton dimension kept.""" + if assert_equal: + assert len(np_array) == sum(lengths) + length_indices = [0, ] + for i in range(len(lengths)): + length_indices.append(length_indices[i] + lengths[i]) + if dim == 0: + array_list = [np_array[length_indices[i]:length_indices[i+1]] for i in range(len(lengths))] + elif dim == 1: + array_list = [np_array[:, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + elif dim == 2: + array_list = [np_array[:, :, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + else: + raise
NotImplementedError + return array_list + + +import time +import logging +import os + +def get_logger(dir, tile): + os.makedirs(dir, exist_ok=True) + log_file = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + log_file = os.path.join(dir, "{}_{}.log".format(log_file, tile)) + + logger = logging.getLogger() + logger.setLevel('DEBUG') + BASIC_FORMAT = "%(levelname)s:%(message)s" + # DATE_FORMAT = '%Y-%m-%d %H:%M:%S' + formatter = logging.Formatter(BASIC_FORMAT) + chlr = logging.StreamHandler() + chlr.setFormatter(formatter) + + fhlr = logging.FileHandler(log_file) + fhlr.setFormatter(formatter) + fhlr.setLevel('INFO') + + logger.addHandler(chlr) + logger.addHandler(fhlr) + return logger + +def get_ratio_from_counter(counter_obj, threshold=200): + keys = counter_obj.keys() + values = counter_obj.values() + filtered_values = [counter_obj[k] for k in keys if k > threshold] + return float(sum(filtered_values)) / sum(values) + + +def get_show_name(vid_name): + """ + get tvshow name from vid_name + :param vid_name: video clip name + :return: tvshow name + """ + show_list = ["friends", "met", "castle", "house", "grey"] + vid_name_prefix = vid_name.split("_")[0] + show_name = vid_name_prefix if vid_name_prefix in show_list else "bbt" + return show_name diff --git a/utils/find_best_epoch.py b/utils/find_best_epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..7d13277255212e453cd1a63f37b6df0d4ce108ba --- /dev/null +++ b/utils/find_best_epoch.py @@ -0,0 +1,21 @@ +def rewrite_epoch(filename, new_file_name): + max_value = float(-100) + new_file = [] + + with open(filename, 'r') as file: + for line in file: + new_file.append(line) + if line.startswith("INFO:VAL"): + anchor = float(line.split()[5]) # Assuming the value is at the 5th index + if anchor > max_value: + max_value = anchor + print(max_value) + new_file.append("BEST: " + line) + + with open(new_file_name, 'w') as file: + file.writelines(new_file) + +# Example usage +filename = "results/XML_top40_20240704_170747/20240704_170747_XML_top40.log" +new_file_name = "results/XML_top40_20240704_170747/new.log" +best_epoch = rewrite_epoch(filename, new_file_name) diff --git a/utils/mk_video_split_with_duration.py b/utils/mk_video_split_with_duration.py new file mode 100644 index 0000000000000000000000000000000000000000..ab5a524174febeb4515e511dc33c10a74c212d84 --- /dev/null +++ b/utils/mk_video_split_with_duration.py @@ -0,0 +1,18 @@ +from utils.basic_utils import load_json, save_json + + +def combine(video_name_split_path, video_duration_path, save_path): + video_name_split = load_json(video_name_split_path) + video_duration_dict = load_json(video_duration_path) + + combined_dict = {} + for split_name, split_video_names in video_name_split.items(): + combined_dict[split_name] = {vid_name: video_duration_dict[vid_name] + for vid_name in split_video_names} + save_json(combined_dict, save_path) + + +if __name__ == '__main__': + import sys + combine(*sys.argv[1:]) + diff --git a/utils/model_utils.py b/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c9ef498bc7005bfa162047ba3a2c49196a95017f --- /dev/null +++ b/utils/model_utils.py @@ -0,0 +1,105 @@ +__author__ = "Jie Lei" + +# ref: https://github.com/lichengunc/MAttNet/blob/master/lib/layers/lang_encoder.py#L11 +# ref: https://github.com/easonnie/flint/blob/master/torch_util.py#L272 +import torch +import torch.nn as nn +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + + +class RNNEncoder(nn.Module): + """A RNN 
wrapper handles variable length inputs, always set batch_first=True. + Supports LSTM, GRU and RNN. Tested with PyTorch 0.3 and 0.4 + """ + def __init__(self, word_embedding_size, hidden_size, bidirectional=True, + dropout_p=0, n_layers=1, rnn_type="lstm", + return_hidden=True, return_outputs=True, + allow_zero=False): + super(RNNEncoder, self).__init__() + """ + :param word_embedding_size: rnn input size + :param hidden_size: rnn output size + :param dropout_p: between rnn layers, only useful when n_layer >= 2 + """ + self.allow_zero = allow_zero + self.rnn_type = rnn_type + self.n_dirs = 2 if bidirectional else 1 + # - add return_hidden keyword arg to reduce computation if hidden is not needed. + self.return_hidden = return_hidden + self.return_outputs = return_outputs + self.rnn = getattr(nn, rnn_type.upper())(word_embedding_size, hidden_size, n_layers, + batch_first=True, + bidirectional=bidirectional, + dropout=dropout_p) + + def sort_batch(self, seq, lengths): + sorted_lengths, perm_idx = lengths.sort(0, descending=True) + if self.allow_zero: # deal with zero by change it to one. + sorted_lengths[sorted_lengths == 0] = 1 + reverse_indices = [0] * len(perm_idx) + for i in range(len(perm_idx)): + reverse_indices[perm_idx[i]] = i + sorted_seq = seq[perm_idx] + return sorted_seq, list(sorted_lengths), reverse_indices + + def forward(self, inputs, lengths): + """ + inputs, sorted_inputs -> (B, T, D) + lengths -> (B, ) + outputs -> (B, T, n_dirs * D) + hidden -> (n_layers * n_dirs, B, D) -> (B, n_dirs * D) keep the last layer + - add total_length in pad_packed_sequence for compatiblity with nn.DataParallel, --remove it + """ + assert len(inputs) == len(lengths) + sorted_inputs, sorted_lengths, reverse_indices = self.sort_batch(inputs, lengths) + packed_inputs = pack_padded_sequence(sorted_inputs, sorted_lengths, batch_first=True) + outputs, hidden = self.rnn(packed_inputs) + if self.return_outputs: + # outputs, lengths = pad_packed_sequence(outputs, batch_first=True, total_length=int(max(lengths))) + outputs, lengths = pad_packed_sequence(outputs, batch_first=True) + outputs = outputs[reverse_indices] + else: + outputs = None + if self.return_hidden: # + if self.rnn_type.lower() == "lstm": + hidden = hidden[0] + hidden = hidden[-self.n_dirs:, :, :] + hidden = hidden.transpose(0, 1).contiguous() + hidden = hidden.view(hidden.size(0), -1) + hidden = hidden[reverse_indices] + else: + hidden = None + return outputs, hidden + + +def pool_across_time(outputs, lengths, pool_type="max"): + """ Get maximum responses from RNN outputs along time axis + :param outputs: (B, T, D) + :param lengths: (B, ) + :param pool_type: str, 'max' or 'mean' + :return: (B, D) + """ + if pool_type == "max": + outputs = [outputs[i, :int(lengths[i]), :].max(dim=0)[0] for i in range(len(lengths))] + elif pool_type == "mean": + outputs = [outputs[i, :int(lengths[i]), :].mean(dim=0) for i in range(len(lengths))] + else: + raise NotImplementedError("Only support mean and max pooling") + return torch.stack(outputs, dim=0) + + +def count_parameters(model, verbose=True): + """Count number of parameters in PyTorch model, + References: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7. 
+ + from utils.utils import count_parameters + count_parameters(model) + import sys + sys.exit(1) + """ + n_all = sum(p.numel() for p in model.parameters()) + n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + if verbose: + print("Parameter Count: all {:,d}; trainable {:,d}".format(n_all, n_trainable)) + return n_all, n_trainable + diff --git a/utils/temporal_nms.py b/utils/temporal_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..545ed8045d7da4a6a831395029e39c0f803025d5 --- /dev/null +++ b/utils/temporal_nms.py @@ -0,0 +1,74 @@ +""" +Non-Maximum Suppression for video proposals. +""" + + +def compute_temporal_iou(pred, gt): + """ deprecated due to performance concerns + compute intersection-over-union along temporal axis + Args: + pred: [st (float), ed (float)] + gt: [st (float), ed (float)] + Returns: + iou (float): + + Ref: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + """ + intersection = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0])) + union = max(pred[1], gt[1]) - min(pred[0], gt[0]) # not the correct union though + if union == 0: + return 0 + else: + return 1.0 * intersection / union + + +def temporal_non_maximum_suppression(predictions, nms_threshold, max_after_nms=100): + """ + Args: + predictions: list(sublist), each sublist is [st (float), ed(float), score (float)], + note larger scores are better and are preserved. For metrics that are better when smaller, + please convert to its negative, e.g., convert distance to negative distance. + nms_threshold: float in [0, 1] + max_after_nms: + Returns: + predictions_after_nms: list(sublist), each sublist is [st (float), ed(float), score (float)] + References: + https://github.com/wzmsltw/BSN-boundary-sensitive-network/blob/7b101fc5978802aa3c95ba5779eb54151c6173c6/Post_processing.py#L42 + """ + if len(predictions) == 1: # only has one prediction, no need for nms + return predictions + + predictions = sorted(predictions, key=lambda x: x[2], reverse=True) # descending order + + tstart = [e[0] for e in predictions] + tend = [e[1] for e in predictions] + tscore = [e[2] for e in predictions] + rstart = [] + rend = [] + rscore = [] + while len(tstart) > 1 and len(rscore) < max_after_nms: # max 100 after nms + idx = 1 + while idx < len(tstart): # compare with every prediction in the list. + if compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]) > nms_threshold: + # rm highly overlapped lower score entries. + tstart.pop(idx) + tend.pop(idx) + tscore.pop(idx) + # print("--------------------------------") + # print(compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]])) + # print([tstart[0], tend[0]], [tstart[idx], tend[idx]]) + # print(tstart.pop(idx), tend.pop(idx), tscore.pop(idx)) + else: + # move to next + idx += 1 + rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + if len(rscore) < max_after_nms and len(tstart) >= 1: # add the last, possibly empty. 
+ rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + predictions_after_nms = [[st, ed, s] for s, st, ed in zip(rscore, rstart, rend)] + return predictions_after_nms diff --git a/utils/tensor_utils.py b/utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..72497127fdbbd935bfc8c42b5fae723db04d73f8 --- /dev/null +++ b/utils/tensor_utils.py @@ -0,0 +1,141 @@ +import numpy as np +import torch + + +def pad_sequences_1d(sequences, dtype=torch.long, device=torch.device("cpu"), fixed_length=None): + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + device: + fixed_length: pad all seq in sequences to fixed length. All seq should have a length <= fixed_length. + return will be of shape [len(sequences), fixed_length, ...] + Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=torch.long) + >>> test_data_3d = [torch.randn(2,3,4), torch.randn(4,3,4), torch.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=torch.float) + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + if "torch" in str(dtype): + sequences = [torch.tensor(s, dtype=dtype, device=device) for s in sequences] + else: + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + if fixed_length is not None: + max_length = fixed_length + else: + max_length = max(lengths) + if isinstance(sequences[0], torch.Tensor): + assert "torch" in str(dtype), "dtype and input type does not match" + padded_seqs = torch.zeros((len(sequences), max_length) + extra_dims, dtype=dtype, device=device) + mask = torch.zeros((len(sequences), max_length), dtype=torch.float32, device=device) + else: # np + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max_length) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max_length), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask # , lengths + + +def pad_sequences_2d(sequences, dtype=torch.long): + """ Pad a double-nested list or a sequence of n-d torch tensor into a (n+1)-d tensor, + only allow the first two dims has variable lengths + Args: + sequences: list(n-d tensor or list) + dtype: torch.long for word indices / torch.float (float32) for other cases + Returns: + Examples: + >>> test_data_list = [[[1, 3, 5], [3, 7, 4, 1]], [[98, 34, 11, 89, 90], [22], [34, 56]],] + >>> pad_sequences_2d(test_data_list, dtype=torch.long) # torch.Size([2, 3, 5]) + >>> test_data_3d = [torch.randn(2,2,4), torch.randn(4,3,4), torch.randn(1,5,4)] + >>> pad_sequences_2d(test_data_3d, dtype=torch.float) # torch.Size([2, 3, 5]) + >>> test_data_3d2 = [[torch.randn(2,4), ], [torch.randn(3,4), 
torch.randn(5,4)]] + >>> pad_sequences_2d(test_data_3d2, dtype=torch.float) # torch.Size([2, 3, 5]) + # TODO add support for numpy array + """ + bsz = len(sequences) + para_lengths = [len(seq) for seq in sequences] + max_para_len = max(para_lengths) + sen_lengths = [[len(word_seq) for word_seq in seq] for seq in sequences] + max_sen_len = max([max(e) for e in sen_lengths]) + + if isinstance(sequences[0], torch.Tensor): + extra_dims = sequences[0].shape[2:] + elif isinstance(sequences[0][0], torch.Tensor): + extra_dims = sequences[0][0].shape[1:] + else: + sequences = [[torch.Tensor(word_seq, dtype=dtype) for word_seq in seq] for seq in sequences] + extra_dims = () + + padded_seqs = torch.zeros((bsz, max_para_len, max_sen_len) + extra_dims, dtype=dtype) + mask = torch.zeros(bsz, max_para_len, max_sen_len).float() + + for b_i in range(bsz): + for sen_i, sen_l in enumerate(sen_lengths[b_i]): + padded_seqs[b_i, sen_i, :sen_l] = sequences[b_i][sen_i] + mask[b_i, sen_i, :sen_l] = 1 + return padded_seqs, mask # , sen_lengths + + +def find_max_triples(st_prob, ed_prob, top_n=5, prob_thd=None, tensor_type="torch"): + """ Find a list of (k1, k2) where k1 < k2 with the maximum values of st_prob[k1] * ed_prob[k2] + Args: + st_prob (torch.Tensor or np.ndarray): (N, L) batched start_idx probabilities + ed_prob (torch.Tensor or np.ndarray): (N, L) batched end_idx probabilities + top_n (int): return topN pairs with highest values + prob_thd (float): + tensor_type: str, np or torch + Returns: + batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] + """ + if tensor_type == "torch": + st_prob, ed_prob = st_prob.data.numpy(), ed_prob.data.numpy() + product = np.einsum("bm,bn->bmn", st_prob, ed_prob) + # (N, L, L) the lower part becomes zeros, start_idx < ed_idx + upper_product = np.triu(product, k=1) + return find_max_triples_from_upper_triangle_product(upper_product, top_n=top_n, prob_thd=prob_thd) + + +def find_max_triples_from_upper_triangle_product(upper_product, top_n=5, prob_thd=None): + """ Find a list of (k1, k2) where k1 < k2 with the maximum values of p1[k1] * p2[k2] + Args: + upper_product (torch.Tensor or np.ndarray): (N, L, L), the lower part becomes zeros, end_idx > start_idx + top_n (int): return topN pairs with highest values + prob_thd (float or None): + Returns: + batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] + """ + batched_sorted_triple = [] + for idx, e in enumerate(upper_product): + sorted_triple = top_n_array_2d(e, top_n=top_n) + if prob_thd is not None: + sorted_triple = sorted_triple[sorted_triple[2] >= prob_thd] + batched_sorted_triple.append(sorted_triple) + return batched_sorted_triple + + +def top_n_array_2d(array_2d, top_n): + """ Get topN indices and values of a 2d array, return a tuple of indices and their values, + ranked by the value + """ + row_indices, column_indices = np.unravel_index(np.argsort(array_2d, axis=None), array_2d.shape) + row_indices = row_indices[::-1][:top_n] + column_indices = column_indices[::-1][:top_n] + sorted_values = array_2d[row_indices, column_indices] + return np.stack([row_indices, column_indices, sorted_values], axis=1) # (N, 3)
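As a quick sanity check for the helpers added in `utils/tensor_utils.py` above, here is a minimal usage sketch of `pad_sequences_1d` and `find_max_triples`; it is not part of the diff, and the toy inputs are made up purely for illustration.

```python
import torch
from utils.tensor_utils import pad_sequences_1d, find_max_triples

# Pad three variable-length index sequences into a (3, 4) tensor plus a validity mask.
queries = [[1, 2, 3], [1, 2], [3, 4, 7, 9]]
padded, mask = pad_sequences_1d(queries, dtype=torch.long)
print(padded.shape, mask.shape)  # torch.Size([3, 4]) torch.Size([3, 4])

# Top (start, end) index pairs with start < end, scored by st_prob[st] * ed_prob[ed].
st_prob = torch.softmax(torch.randn(2, 6), dim=1)  # (N, L) start probabilities
ed_prob = torch.softmax(torch.randn(2, 6), dim=1)  # (N, L) end probabilities
triples = find_max_triples(st_prob, ed_prob, top_n=3)  # N arrays of shape (3, 3): [st_idx, ed_idx, score]
print(triples[0])
```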