Liangrj5 committed
Commit ebf5d87 · 1 Parent(s): 6dd9459
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. baselines/__init__.py +0 -0
  2. baselines/__pycache__/__init__.cpython-311.pyc +0 -0
  3. baselines/clip_alignment_with_language/README.md +25 -0
  4. baselines/clip_alignment_with_language/__init__.py +0 -0
  5. baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc +0 -0
  6. baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc +0 -0
  7. baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc +0 -0
  8. baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc +0 -0
  9. baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc +0 -0
  10. baselines/clip_alignment_with_language/config.py +207 -0
  11. baselines/clip_alignment_with_language/inference.py +672 -0
  12. baselines/clip_alignment_with_language/local_utils/__init__.py +0 -0
  13. baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc +0 -0
  14. baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc +0 -0
  15. baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc +0 -0
  16. baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py +117 -0
  17. baselines/clip_alignment_with_language/local_utils/proposal.py +181 -0
  18. baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt +61 -0
  19. baselines/clip_alignment_with_language/mix_model_prediction.py +86 -0
  20. baselines/clip_alignment_with_language/model.py +299 -0
  21. baselines/clip_alignment_with_language/proposal_retrieval_dataset.py +587 -0
  22. baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh +23 -0
  23. baselines/clip_alignment_with_language/scripts/inference.sh +17 -0
  24. baselines/clip_alignment_with_language/scripts/inference_mix.sh +27 -0
  25. baselines/clip_alignment_with_language/scripts/inference_with_external.sh +54 -0
  26. baselines/clip_alignment_with_language/scripts/re_train_cal.sh +21 -0
  27. baselines/clip_alignment_with_language/scripts/re_train_mcn.sh +21 -0
  28. baselines/clip_alignment_with_language/scripts/train.sh +80 -0
  29. baselines/clip_alignment_with_language/train.py +310 -0
  30. baselines/crossmodal_moment_localization/README.md +2 -0
  31. baselines/crossmodal_moment_localization/__init__.py +0 -0
  32. baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc +0 -0
  33. baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc +0 -0
  34. baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc +0 -0
  35. baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc +0 -0
  36. baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc +0 -0
  37. baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc +0 -0
  38. baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc +0 -0
  39. baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc +0 -0
  40. baselines/crossmodal_moment_localization/config.py +276 -0
  41. baselines/crossmodal_moment_localization/inference.py +414 -0
  42. baselines/crossmodal_moment_localization/model_components.py +317 -0
  43. baselines/crossmodal_moment_localization/model_xml.py +642 -0
  44. baselines/crossmodal_moment_localization/ndcg_iou_topk.py +68 -0
  45. baselines/crossmodal_moment_localization/optimization.py +338 -0
  46. baselines/crossmodal_moment_localization/scripts/eval.sh +14 -0
  47. baselines/crossmodal_moment_localization/scripts/inference.sh +18 -0
  48. baselines/crossmodal_moment_localization/scripts/inference_with_external.sh +40 -0
  49. baselines/crossmodal_moment_localization/scripts/train.sh +70 -0
  50. baselines/crossmodal_moment_localization/start_end_dataset.py +393 -0
baselines/__init__.py ADDED
File without changes
baselines/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (176 Bytes).
baselines/clip_alignment_with_language/README.md ADDED
@@ -0,0 +1,25 @@
+ # Clip Alignment With Language
+ This folder contains the CAL model described in the paper:
+ ```
+ @article{Escorcia2019TemporalLO,
+   title={Temporal Localization of Moments in Video Collections with Natural Language},
+   author={Victor Escorcia and Mattia Soldan and Josef Sivic and Bernard Ghanem and Bryan Russell},
+   journal={ArXiv},
+   year={2019},
+   volume={abs/1907.12763}
+ }
+ ```
+
+ It also resembles the MCN model in:
+ ```
+ @article{Hendricks2017LocalizingMI,
+   title={Localizing Moments in Video with Natural Language},
+   author={Lisa Anne Hendricks and Oliver Wang and Eli Shechtman and Josef Sivic and Trevor Darrell and Bryan C. Russell},
+   journal={2017 IEEE International Conference on Computer Vision (ICCV)},
+   year={2017},
+   pages={5804-5813}
+ }
+ ```
+
+ Disclaimer: This code is implemented by [Jie Lei](http://www.cs.unc.edu/~jielei/) for the TVR dataset;
+ it does not guarantee the reproducibility of the original authors' results.
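Both CAL and MCN embed a query and candidate video moments into a shared space and rank moments by embedding distance; the `--margin` and `--loss_type {hinge, lse}` options in `config.py` below configure the ranking objective. A minimal, self-contained sketch of that kind of objective (an illustration under those assumptions, not the repo's actual `model.py`):
```
import torch

def moment_ranking_loss(pos_dist, neg_dist, margin=0.1, loss_type="hinge"):
    # Smaller distance = better match: a positive moment should beat a negative
    # one by at least `margin`. "lse" is the smooth LogSumExp relaxation of hinge.
    delta = margin + pos_dist - neg_dist
    if loss_type == "hinge":
        return torch.clamp(delta, min=0).mean()
    return torch.log1p(torch.exp(delta)).mean()

# Toy usage with made-up query-moment distances.
pos = torch.tensor([0.2, 0.3, 0.1, 0.4])
neg = torch.tensor([0.6, 0.2, 0.5, 0.9])
print(moment_ranking_loss(pos, neg), moment_ranking_loss(pos, neg, loss_type="lse"))
```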
baselines/clip_alignment_with_language/__init__.py ADDED
File without changes
baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (205 Bytes).
baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc ADDED
Binary file (17.8 kB).
baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc ADDED
Binary file (43 kB).
baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc ADDED
Binary file (15.8 kB).
baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc ADDED
Binary file (37 kB).
baselines/clip_alignment_with_language/config.py ADDED
@@ -0,0 +1,207 @@
1
+ import os
2
+ import time
3
+ import torch
4
+ import argparse
5
+
6
+ from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile
7
+ from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs
8
+
9
+
10
+ class BaseOptions(object):
11
+ saved_option_filename = "opt.json"
12
+ ckpt_filename = "model.ckpt"
13
+ tensorboard_log_dir = "tensorboard_log"
14
+ train_log_filename = "train.log.txt"
15
+ eval_log_filename = "eval.log.txt"
16
+
17
+ def __init__(self):
18
+ self.parser = argparse.ArgumentParser()
19
+ self.initialized = False
20
+ self.opt = None
21
+
22
+ def initialize(self):
23
+ self.initialized = True
24
+ self.parser.add_argument("--dset_name", type=str, choices=["tvr"])
25
+ self.parser.add_argument("--eval_split_name", type=str, default="val",
26
+ help="should match keys in corpus_path, must set for VCMR")
27
+ self.parser.add_argument("--debug", action="store_true",
28
+ help="debug (fast) mode, break all loops, do not load all data into memory.")
29
+ self.parser.add_argument("--data_ratio", type=float, default=1.0,
30
+ help="how many training and eval data to use. 1.0: use all, 0.1: use 10%."
31
+ "Use small portion for debug purposes. Note this is different from --debug, "
32
+ "which works by breaking the loops, typically they are not used together.")
33
+ self.parser.add_argument("--results_root", type=str, default="results")
34
+ self.parser.add_argument("--exp_id", type=str, default="res", help="id of the current run")
35
+ self.parser.add_argument("--seed", type=int, default=2018, help="random seed")
36
+ self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu")
37
+ self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job")
38
+ self.parser.add_argument("--num_workers", type=int, default=8,
39
+ help="num subprocesses used to load the data, 0: use main process")
40
+ self.parser.add_argument("--no_core_driver", action="store_true",
41
+ help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`")
42
+ self.parser.add_argument("--no_pin_memory", action="store_true",
43
+ help="Don't use pin_memory=True for dataloader. "
44
+ "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4")
45
+
46
+ # training config
47
+ self.parser.add_argument("--lr", type=float, default=0.05, help="learning rate")
48
+ self.parser.add_argument("--wd", type=float, default=0, help="weight decay")
49
+ self.parser.add_argument("--momentum", type=float, default=0.95, help="momentum for SGD")
50
+ self.parser.add_argument("--n_epoch", type=int, default=108, help="number of epochs to run")
51
+ self.parser.add_argument("--max_es_cnt", type=int, default=108, help="number of epochs to early stop")
52
+ self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size")
53
+ self.parser.add_argument("--eval_query_bsz", type=int, default=1000,
54
+ help="mini-batch size at inference, for query")
55
+ self.parser.add_argument("--eval_proposal_bsz", type=int, default=200,
56
+ help="mini-batch size at inference, for proposals")
57
+ self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model")
58
+ self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable")
59
+ self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss")
60
+ self.parser.add_argument("--inter_loss_weight", type=float, default=0.4, help="margin for ranking loss")
61
+ self.parser.add_argument("--loss_type", type=str, default="hinge", choices=["hinge", "lse"],
62
+ help="att loss type, can be hinge loss or its smooth approximation LogSumExp")
63
+
64
+ # Model and Data config
65
+ self.parser.add_argument("--max_sub_l", type=int, default=50,
66
+ help="max length of all sub sentence 97.71 under 50 for 3 sentences")
67
+ self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions")
68
+ self.parser.add_argument("--pos_iou_thd", type=float, default=0.7, help="moments with IoU >= as positive")
69
+ self.parser.add_argument("--neg_iou_thd", type=float, default=0.35, help="moments with IoU < as negative")
70
+
71
+ self.parser.add_argument("--train_path", type=str, default=None)
72
+ self.parser.add_argument("--eval_path", type=str, default=None,
73
+ help="Evaluating during training, for Dev set. If None, will only do training, "
74
+ "anet_cap and charades_sta has no dev set, so None")
75
+ self.parser.add_argument("--external_train_vr_res_path", type=str, default=None,
76
+ help="if set, use external video retrieval results to guide "
77
+ "inter-nvideo negative sampling. ")
78
+ self.parser.add_argument("--init_ckpt_path", type=str, default=None,
79
+ help="init model parameters from checkpoint. Use absolute path")
80
+ self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None,
81
+ help="if set, use external video retrieval results to guide evaluation. ")
82
+ self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features")
83
+ self.parser.add_argument("--word2idx_path", type=str,
84
+ help="a dict, {word: word_idx, ...}, "
85
+ "special tokens are {<pad>: 0, <unk>: 1, <eos>: 2}")
86
+ self.parser.add_argument("--vocab_size", type=int, default=-1,
87
+ help="Set automatically to len(word2idx)")
88
+ self.parser.add_argument("--glove_path", type=str,
89
+ help="path to file containing the GloVe embeddings for words in word2idx")
90
+ self.parser.add_argument("--desc_bert_path", type=str, default=None)
91
+ self.parser.add_argument("--sub_bert_path", type=str, default=None)
92
+ self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature")
93
+ self.parser.add_argument("--desc_feat_size", type=int, default=768)
94
+ self.parser.add_argument("--ctx_mode", type=str,
95
+ choices=["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"],
96
+ help="which context to use. a combination of [video, sub, tef]")
97
+ self.parser.add_argument("--corpus_path", type=str, default=None)
98
+ self.parser.add_argument("--vid_feat_path", type=str, default="")
99
+ self.parser.add_argument("--no_norm_vfeat", action="store_true",
100
+ help="Do not do normalization on video feat, use it when using i3d_resnet concat feat")
101
+ self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat")
102
+ self.parser.add_argument("--clip_length", type=float, default=None,
103
+ help="each video will be uniformly segmented into small clips, "
104
+ "will automatically loaded from ProposalConfigs if None")
105
+ self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature")
106
+
107
+ self.parser.add_argument("--model_type", default="cal", choices=["cal", "mcn"])
108
+ self.parser.add_argument("--embedding_size", type=int, default=768)
109
+ self.parser.add_argument("--lstm_hidden_size", type=int, default=256)
110
+ self.parser.add_argument("--visual_hidden_size", type=int, default=256)
111
+ self.parser.add_argument("--output_size", type=int, default=256)
112
+
113
+ # post processing
114
+ self.parser.add_argument("--nms_thd", type=float, default=-1,
115
+ help="additionally use non-maximum suppression "
116
+ "(or non-minimum suppression for distance)"
117
+ "to post-processing the predictions. "
118
+ "-1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap,")
119
+ self.parser.add_argument("--max_after_nms", type=int, default=100, help="Stores at max_after_nms for eval")
120
+ self.parser.add_argument("--max_before_nms", type=int, default=300, help="Max before nms")
121
+ self.parser.add_argument("--use_intermediate", action="store_true",
122
+ help="Whether to use/save intermediate results to results directory."
123
+ "Might want use this if we are going to ")
124
+
125
+ def save_args(self, opt):
126
+ args = vars(opt)
127
+ # Save settings
128
+ if not isinstance(self, TestOptions):
129
+ option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed
130
+ save_json(args, option_file_path, save_pretty=True)
131
+
132
+ def parse(self):
133
+ if not self.initialized:
134
+ self.initialize()
135
+ opt = self.parser.parse_args()
136
+
137
+ if opt.debug:
138
+ opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ])
139
+ opt.no_core_driver = True
140
+ opt.num_workers = 0
141
+
142
+ if isinstance(self, TestOptions):
143
+ # modify model_dir to absolute path
144
+ opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir)
145
+ saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename))
146
+ for arg in saved_options: # use saved options to overwrite all BaseOptions args.
147
+ if arg not in ["results_root", "num_workers", "nms_thd", "debug", "eval_split_name", "eval_path",
148
+ "use_intermediate", "external_inference_vr_res_path"]:
149
+ setattr(opt, arg, saved_options[arg])
150
+ # opt.no_core_driver = True
151
+ else:
152
+ if opt.exp_id is None:
153
+ raise ValueError("--exp_id is required for at a training option!")
154
+
155
+ if opt.clip_length is None:
156
+ opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"]
157
+ opt.results_dir = os.path.join(opt.results_root,
158
+ "-".join([opt.dset_name, opt.model_type, opt.ctx_mode, opt.exp_id,
159
+ time.strftime("%Y_%m_%d_%H_%M_%S")]))
160
+ mkdirp(opt.results_dir)
161
+ # save a copy of current code
162
+ code_dir = os.path.dirname(os.path.realpath(__file__))
163
+ code_zip_filename = os.path.join(opt.results_dir, "code.zip")
164
+ make_zipfile(code_dir, code_zip_filename,
165
+ enclosing_dir="code",
166
+ exclude_dirs_substring="results",
167
+ exclude_dirs=["results", "debug_results", "__pycache__"],
168
+ exclude_extensions=[".pyc", ".ipynb", ".swap"])
169
+
170
+ self.save_args(opt)
171
+
172
+ if "sub" in opt.ctx_mode:
173
+ assert opt.dset_name == "tvr", "sub is only supported for tvr dataset"
174
+
175
+ if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d
176
+ assert opt.no_norm_vfeat
177
+
178
+ opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename)
179
+ opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename)
180
+ opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename)
181
+ opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir)
182
+ opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu")
183
+ opt.h5driver = None if opt.no_core_driver else "core"
184
+ # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5
185
+ opt.pin_memory = not opt.no_pin_memory
186
+ opt.num_workers = 1 if opt.no_core_driver else opt.num_workers
187
+
188
+ # Display settings
189
+ print("------------ Options -------------\n{}\n-------------------"
190
+ .format({str(k): str(v) for k, v in sorted(vars(opt).items())}))
191
+ self.opt = opt
192
+ return opt
193
+
194
+
195
+ class TestOptions(BaseOptions):
196
+ """add additional options for evaluating"""
197
+ def initialize(self):
198
+ BaseOptions.initialize(self)
199
+ # also need to specify --eval_split_name
200
+ self.parser.add_argument("--eval_id", type=str, help="evaluation id")
201
+ self.parser.add_argument("--model_dir", type=str,
202
+ help="dir contains the model file, will be converted to absolute path afterwards")
203
+ self.parser.add_argument("--tasks", type=str, nargs="+", choices=["VCMR", "SVMR", "VR"], default="SVMR",
204
+ help="Which tasks to run."
205
+ "VCMR: Video Corpus Moment Retrieval;"
206
+ "SVMR: Single Video Moment Retrieval;"
207
+ "VR: regular Video Retrieval.")
baselines/clip_alignment_with_language/inference.py ADDED
@@ -0,0 +1,672 @@
1
+ import os
2
+ import time
3
+ import math
4
+ import pprint
5
+ import numpy as np
6
+ from tqdm import tqdm, trange
7
+ from collections import defaultdict, OrderedDict
8
+
9
+ import torch
10
+ import torch.backends.cudnn as cudnn
11
+ from torch.utils.data import DataLoader
12
+
13
+ from baselines.clip_alignment_with_language.config import TestOptions
14
+ from baselines.clip_alignment_with_language.model import CALWithSub
15
+ from baselines.clip_alignment_with_language.proposal_retrieval_dataset import \
16
+ proposal_retrieval_collate, ProposalRetrievalEvalDataset, prepare_batch_inputs
17
+ from utils.basic_utils import save_jsonl, save_json, load_json
18
+ from utils.temporal_nms import temporal_non_maximum_suppression
19
+ from utils.tensor_utils import pad_sequences_1d
20
+ from standalone_eval.eval import eval_retrieval
21
+
22
+ import logging
23
+
24
+ logger = logging.getLogger(__name__)
25
+ logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s",
26
+ datefmt="%Y-%m-%d %H:%M:%S",
27
+ level=logging.INFO)
28
+
29
+
30
+ def combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list):
31
+ """
32
+ Args:
33
+ proposals_embedding_list: list(torch.Tensor), bsz * (N_prop, N_clips, D_o)
34
+ proposals_mask_list: list(torch.Tensor), bsz * (N_prop, N_clips)
35
+ """
36
+ if len(proposals_embedding_list) == 1:
37
+ return proposals_embedding_list[0], proposals_mask_list[0]
38
+ else: # > 1
39
+ max_n_clips = max([e.shape[1] for e in proposals_embedding_list])
40
+ n_proposals = sum([len(e) for e in proposals_embedding_list])
41
+ d = proposals_embedding_list[0].shape[2]
42
+ proposals_embedding = proposals_embedding_list[0].new_zeros((n_proposals, max_n_clips, d))
43
+ proposals_mask = proposals_mask_list[0].new_zeros((n_proposals, max_n_clips))
44
+ mask_lengths = [0, ] + [len(m) for m in proposals_mask_list]
45
+ mask_cumsum_lengths = np.cumsum(mask_lengths)
46
+ for idx, (e, m) in enumerate(zip(proposals_embedding_list, proposals_mask_list)):
47
+ proposals_embedding[mask_cumsum_lengths[idx]:mask_cumsum_lengths[idx + 1], :e.shape[1]] = e
48
+ proposals_mask[mask_cumsum_lengths[idx]:mask_cumsum_lengths[idx + 1], :m.shape[1]] = m
49
+ return proposals_embedding, proposals_mask
50
+
51
+
52
+ def compute_query_embeddings(model, eval_dataset, opt, load_gt_vid_name):
53
+ """Use val set to do evaluation, remember to run with torch.no_grad().
54
+ estimated size 20,000 (query) * 100 (hsz) * 4 / (1024**2) = 7.63 MB
55
+ """
56
+ model.eval()
57
+ eval_dataset.set_data_mode("query")
58
+ eval_dataset.load_gt_vid_name_for_query(load_gt_vid_name)
59
+ query_eval_loader = DataLoader(eval_dataset,
60
+ collate_fn=proposal_retrieval_collate,
61
+ batch_size=opt.eval_query_bsz,
62
+ num_workers=opt.num_workers,
63
+ shuffle=False,
64
+ pin_memory=opt.pin_memory)
65
+ global_meta_list = [] # list(dicts)
66
+ # n_query = min(len(eval_dataset), opt.eval_query_bsz) if opt.debug else len(eval_dataset)
67
+ n_query = len(eval_dataset)
68
+ global_query_embedding = torch.empty((n_query,
69
+ model.config.output_size),
70
+ dtype=torch.float32, device=opt.device) # (N_q, D_o)
71
+ for idx, batch in tqdm(enumerate(query_eval_loader),
72
+ desc="Computing q embedding",
73
+ total=len(query_eval_loader)):
74
+ global_meta_list.extend(batch[0])
75
+ model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory)
76
+ global_query_embedding[idx * opt.eval_query_bsz: (idx + 1) * opt.eval_query_bsz] = \
77
+ model.query_encoder(**model_inputs)
78
+
79
+ if opt.debug:
80
+ break
81
+ return global_meta_list, global_query_embedding
82
+
83
+
84
+ def compute_proposal_embeddings(model, eval_dataset, opt):
85
+ """Use val set to do evaluation, remember to run with torch.no_grad().
86
+ estimated 1000 (videos) * 300 (proposals) * 20 (clips) * 100 (hsz) * 4 / (1024 ** 3) = 2.24 GB
87
+ """
88
+ model.eval()
89
+ eval_dataset.set_data_mode("context")
90
+ global_meta_list = [] # list(dicts)
91
+ global_proposal_video_embedding_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips, D_o]
92
+ global_proposal_sub_embedding_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips, D_o]
93
+ global_proposal_video_mask_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips]
94
+ global_proposal_sub_mask_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips]
95
+ for idx, single_video_info in tqdm(enumerate(eval_dataset),
96
+ desc="Computing prop embedding for videos",
97
+ total=len(eval_dataset)):
98
+ global_meta_list.append(single_video_info["meta"])
99
+ if model.use_video or model.tef_only:
100
+ proposals_features_list = single_video_info["model_inputs"]["video_moment_features_list"]
101
+ proposals_mask_list = single_video_info["model_inputs"]["video_moment_mask_list"]
102
+ proposals_mask_list = [e.to(opt.device, non_blocking=opt.pin_memory) for e in proposals_mask_list]
103
+ proposals_embedding_list = [] # (N_prop, D_o)
104
+ for feat in proposals_features_list:
105
+ proposals_embedding_list.append(
106
+ model.moment_encoder(feat.to(opt.device, non_blocking=opt.pin_memory), module_name="video"))
107
+ p, m = combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list)
108
+ global_proposal_video_embedding_list.append(p)
109
+ global_proposal_video_mask_list.append(m)
110
+ else:
111
+ global_proposal_video_embedding_list.append(None)
112
+
113
+ if model.use_sub:
114
+ proposals_features_list = single_video_info["model_inputs"]["sub_moment_features_list"]
115
+ proposals_mask_list = single_video_info["model_inputs"]["sub_moment_mask_list"]
116
+ proposals_mask_list = [e.to(opt.device, non_blocking=opt.pin_memory) for e in proposals_mask_list]
117
+ proposals_embedding_list = [] # (N_prop, D_o)
118
+ for feat in proposals_features_list:
119
+ proposals_embedding_list.append(
120
+ model.moment_encoder(feat.to(opt.device, non_blocking=opt.pin_memory), module_name="sub"))
121
+ p, m = combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list)
122
+ global_proposal_sub_embedding_list.append(p)
123
+ global_proposal_sub_mask_list.append(m)
124
+ else:
125
+ global_proposal_sub_embedding_list.append(None)
126
+
127
+ if opt.debug and idx == 100:
128
+ break
129
+ global_proposal_mask_list = global_proposal_sub_mask_list if model.use_sub else global_proposal_video_mask_list
130
+ return global_meta_list, global_proposal_video_embedding_list, \
131
+ global_proposal_sub_embedding_list, global_proposal_mask_list
132
+
133
+
134
+ def compute_query_proposal_distance(model, eval_dataset, opt, tasks=("SVMR",)):
135
+ """compute and save query and video proposal embeddings,
136
+ tasks: SVMR (single video moment retrieval), VCMR (video corpus moment retrieval)
137
+ """
138
+ is_svmr = "SVMR" in tasks
139
+ is_vcmr = "VCMR" in tasks
140
+ query_meta_list, query_embed = compute_query_embeddings(model, eval_dataset, opt,
141
+ load_gt_vid_name=is_svmr)
142
+ video_meta_list, video_prop_embed_list, sub_prop_embed_list, prop_mask_list = \
143
+ compute_proposal_embeddings(model, eval_dataset, opt)
144
+
145
+ eval_res = dict(
146
+ query_meta=query_meta_list, # N_q * dict()
147
+ video_meta=video_meta_list, # N_videos * dict()
148
+ video2idx=eval_dataset.video2idx, # dict {vid_name: index}
149
+ query_prop_dist_vcmr=[], # N_videos * (N_q, N_prop), note N_prop is changing for each video.
150
+ query_prop_dist_svmr=[], # N_q * (N_prop, ), each query has a GT video, no need to calc. for all.
151
+ )
152
+ if is_vcmr:
153
+ for v_prop_embed, s_prop_embed, prop_mask in tqdm(
154
+ zip(video_prop_embed_list, sub_prop_embed_list, prop_mask_list),
155
+ desc="Computing VCMR q to prop dist for videos",
156
+ total=len(video_prop_embed_list)):
157
+ query_prop_dist = model.compute_cdist_inference(
158
+ query_embed, v_prop_embed, s_prop_embed, prop_mask) # (N_q, N_prop)
159
+ eval_res["query_prop_dist_vcmr"].append(query_prop_dist.cpu())
160
+ if opt.debug:
161
+ break
162
+
163
+ if is_svmr:
164
+ if opt.debug:
165
+ debug_query_meta = []
166
+ # this is different from video2idx
167
+ svmr_video2meta_idx = {e["vid_name"]: idx for idx, e in enumerate(video_meta_list)}
168
+ # logger.info("svmr_video2idx {}".format(list(svmr_video2idx.keys())[:3]))
169
+ for single_q_embed, single_q_meta in tqdm(zip(query_embed, query_meta_list),
170
+ desc="Computing SVMR q to prop dist for videos",
171
+ total=len(query_embed)):
172
+ # logger.info("single_q_meta[vid_name] {}".format(single_q_meta["vid_name"]))
173
+ if opt.debug:
174
+ if single_q_meta["vid_name"] not in svmr_video2meta_idx:
175
+ continue
176
+ debug_query_meta.append(single_q_meta)
177
+ q_gt_vid_meta_idx = svmr_video2meta_idx[single_q_meta["vid_name"]]
178
+ v_prop_embed = video_prop_embed_list[q_gt_vid_meta_idx] # [N_prop, N_clips, D_o]
179
+ s_prop_embed = sub_prop_embed_list[q_gt_vid_meta_idx] # [N_prop, N_clips, D_o]
180
+ prop_mask = prop_mask_list[q_gt_vid_meta_idx] # [N_prop, N_clips]
181
+ query_prop_dist = model.compute_cdist_inference(
182
+ single_q_embed.unsqueeze(0), v_prop_embed, s_prop_embed, prop_mask) # (1, N_prop)
183
+ eval_res["query_prop_dist_svmr"].append(query_prop_dist.squeeze(0).cpu().numpy())
184
+ if opt.debug:
185
+ eval_res["query_meta"] = debug_query_meta
186
+ return eval_res
187
+
188
+
189
+ def filter_vcmr_by_nms(all_video_predictions, nms_threshold=0.6,
190
+ max_before_nms=1000, max_after_nms=100, score_col_idx=3):
191
+ """ Apply non-maximum suppression for all the predictions for each video.
192
+ 1) group predictions by video index
193
+ 2) apply nms individually for each video index group
194
+ 3) combine and sort the predictions
195
+ Args:
196
+ all_video_predictions: list(sublist),
197
+ Each sublist is [video_idx (int), st (float), ed(float), score (float)]
198
+ Note the scores are negative distances.
199
+ nms_threshold: float
200
+ max_before_nms: int
201
+ max_after_nms: int
202
+ score_col_idx: int
203
+ Returns:
204
+
205
+ """
206
+ predictions_neg_by_video_group = defaultdict(list)
207
+ for pred in all_video_predictions[:max_before_nms]:
208
+ predictions_neg_by_video_group[pred[0]].append(pred[1:]) # [st (float), ed(float), score (float)]
209
+
210
+ predictions_by_video_group_neg_after_nms = dict()
211
+ for video_idx, grouped_preds in predictions_neg_by_video_group.items():
212
+ predictions_by_video_group_neg_after_nms[video_idx] = \
213
+ temporal_non_maximum_suppression(grouped_preds, nms_threshold=nms_threshold)
214
+
215
+ predictions_after_nms = []
216
+ for video_idx, grouped_preds in predictions_by_video_group_neg_after_nms.items():
217
+ for pred in grouped_preds:
218
+ pred = [video_idx] + pred # [video_idx (int), st (float), ed(float), score (float)]
219
+ predictions_after_nms.append(pred)
220
+
221
+ # ranking happens across videos
222
+ predictions_after_nms = sorted(predictions_after_nms,
223
+ key=lambda x: x[score_col_idx],
224
+ reverse=True)[:max_after_nms] # descending order
225
+ return predictions_after_nms
226
+
227
+
228
+ def post_processing_vcmr_nms(vcmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100):
229
+ """
230
+ vcmr_res: list(dict), each dict is{
231
+ "desc": str,
232
+ "desc_id": int,
233
+ "predictions": list(sublist) # each sublist is
234
+ [video_idx (int), st (float), ed(float), score (float)], video_idx could be different
235
+ }
236
+ """
237
+ processed_vcmr_res = []
238
+ for e in vcmr_res:
239
+ e["predictions"] = filter_vcmr_by_nms(e["predictions"],
240
+ nms_threshold=nms_thd,
241
+ max_before_nms=max_before_nms,
242
+ max_after_nms=max_after_nms)
243
+ processed_vcmr_res.append(e)
244
+ return processed_vcmr_res
245
+
246
+
247
+ def post_processing_svmr_nms(svmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100):
248
+ """
249
+ svmr_res: list(dict), each dict is
250
+ {"desc": str,
251
+ "desc_id": int,
252
+ "predictions": list(sublist) # each sublist is
253
+ [video_idx (int), st (float), ed(float), score (float)], video_idx is the same.
254
+ }
255
+ """
256
+ processed_svmr_res = []
257
+ for e in svmr_res:
258
+ # the predictions are sorted inside the nms func.
259
+ _predictions = [d[1:] for d in e["predictions"][:max_before_nms]]
260
+ _predictions = temporal_non_maximum_suppression(
261
+ _predictions, nms_threshold=nms_thd)[:max_after_nms]
262
+ _video_id = e["predictions"][0][0] # video_id is the same for all predictions
263
+ e["predictions"] = [[_video_id, ] + d for d in _predictions]
264
+ processed_svmr_res.append(e)
265
+ return processed_svmr_res
266
+
267
+
268
+ def generate_vcmr_predictions_from_res_with_external(eval_res, max_prop_per_query=300, query_bsz_in_sort=1000):
269
+ """ This function is for Video Corpus Moment Retrieval (VCMR).
270
+ Generate prediction file which could be evaluated using standalone_eval.eval.
271
+ Args:
272
+ eval_res: dict(
273
+ query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str}
274
+ video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray}
275
+ video2idx=eval_dataset.video2idx, # dict {vid_name: index}
276
+ video_bsz_in_sort=[], # N_videos * (N_q, N_prop)
277
+ )
278
+ max_prop_per_query: int or None. If None, generate ranking for all possible moments, else generate top {}.
279
+ query_bsz_in_sort: int, only sort a subset of queries at a time, it will be too large to sort all queries.
280
+ return:
281
+ list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)),
282
+ each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance.
283
+ """
284
+ # video2idx
285
+ video2idx = eval_res["video2idx"]
286
+ video_meta = eval_res["video_meta"]
287
+ query_meta = eval_res["query_meta"]
288
+ video_idx2meta_idx = {video2idx[m["vid_name"]]: i for i, m in enumerate(video_meta)}
289
+ external_query2video = eval_res["external_query2video"] if "external_query2video" in eval_res else None
290
+ # 「query idx: [video meta idx]」
291
+ external_query2video_meta_idx = {k: [video_idx2meta_idx[e] for e in v] for k, v in external_query2video.items()}
292
+
293
+ external_ordered_video_meta_indices = torch.LongTensor(
294
+ [external_query2video_meta_idx[e["desc_id"]] for e in query_meta]) # (Nq, 5)
295
+ top_n_retrieved = external_ordered_video_meta_indices.shape[1]
296
+
297
+ # (N_videos, N_prop, N_q), (N_videos, N_prop)
298
+ padded_dist, padded_mask = pad_sequences_1d([e.transpose(0, 1) for e in eval_res["query_prop_dist_vcmr"]],
299
+ dtype=eval_res["query_prop_dist_vcmr"][0].dtype,
300
+ device=eval_res["query_prop_dist_vcmr"][0].device)
301
+ # putting 'NaN' into the invalid bits, torch.sort considers 'NaN' as larger than any number!!!
302
+ padded_dist += (padded_mask.unsqueeze(2) == 0).float() * 1e10
303
+ n_videos, n_prop, n_q = padded_dist.shape
304
+ padded_dist = padded_dist.permute(2, 0, 1) # (N_q, N_videos, N_prop)
305
+
306
+ # get only top retrieved, N_videos now decreased to top_n_retrieved
307
+ row_indices = torch.arange(n_q, device=padded_dist.device)
308
+ padded_dist = torch.stack([
309
+ padded_dist[row_indices, external_ordered_video_meta_indices[:, col_idx]]
310
+ for col_idx in range(top_n_retrieved)], dim=1) # (N_q, 5, N_prop)
311
+ n_videos = top_n_retrieved
312
+
313
+ padded_dist = padded_dist.view(n_q, -1).contiguous() # (N_q, N_video*N_prop)
314
+ print("n_videos, n_prop, n_q {}".format((n_videos, n_prop, n_q)))
315
+ print("padded_dist, {}".format(padded_dist.shape))
316
+
317
+ sorted_distances, sorted_indices = torch.topk(padded_dist.to(torch.device("cuda:0"), non_blocking=True),
318
+ k=min(max_prop_per_query, n_videos * n_prop),
319
+ dim=1, largest=False, sorted=True) # (N_q, max_prop_per_query) * 2
320
+ print("orted_distances {}, sorted_indices {}".format(sorted_distances.shape, sorted_indices.shape))
321
+ sorted_distances = - sorted_distances.cpu().numpy()
322
+
323
+ # (N_q, max_prop_per_query) * 2, prop_indices: inside video indices.
324
+ video_meta_indices_retrieved = torch.floor(sorted_indices.float() / n_prop).long().cpu().numpy()
325
+ # map back to original video idx (not video meta idx, but real video idx)
326
+ video_indices = np.array([[external_query2video[query_meta[i]["desc_id"]][j] for j in r]
327
+ for i, r in enumerate(video_meta_indices_retrieved)]) # (N_q, max_prop_per_query)
328
+ prop_indices = torch.remainder(sorted_indices, n_prop).cpu().numpy() # (N_q, max_prop_per_query)
329
+ print("video_indices {}, prop_indices {}".format(video_indices.shape, prop_indices.shape))
330
+
331
+ vr_res = []
332
+ for i in trange(n_q, desc="[VR] Loop over queries to generate predictions"):
333
+ row = video_indices[i]
334
+ score_row = - sorted_distances[i]
335
+ cur_vr_redictions = []
336
+ for j, video_idx in enumerate(row):
337
+ cur_vr_redictions.append([int(video_idx), 0, 0, float(score_row[j])])
338
+ cur_query_pred = dict(
339
+ desc_id=query_meta[i]["desc_id"],
340
+ desc=query_meta[i]["desc"],
341
+ predictions=cur_vr_redictions
342
+ )
343
+ vr_res.append(cur_query_pred)
344
+
345
+ vcmr_res = []
346
+ logger.debug("sorted_indices {}".format(sorted_indices.shape))
347
+ logger.debug("sorted_distances {}".format(sorted_distances.shape))
348
+ out_bounds_cnt = 0
349
+ for idx, (v_row_indices, p_row_indices) in tqdm(enumerate(zip(video_indices, prop_indices)),
350
+ desc="[VCMR] Loop over queries to generate predictions",
351
+ total=n_q): # query
352
+ sorted_distances_row = - sorted_distances[idx] # converted to negative distance
353
+ # [video_idx(int), st(float), ed(float), score(float)]
354
+ cur_ranked_predictions = []
355
+ for col_idx, (v_col_idx, p_col_idx) in enumerate(zip(v_row_indices, p_row_indices)):
356
+ cur_proposals = eval_res["video_meta"][video_idx2meta_idx[v_col_idx]]["proposals"]
357
+ cur_pred = []
358
+ cur_pred += [int(v_col_idx), ]
359
+ # what is wrong with the indexing below??? (out of bounds), but results seems fine???
360
+ # Not a bug. Since there might be less than max_before_nms proposals from the top retrieved videos
361
+ if p_col_idx >= len(cur_proposals):
362
+ out_bounds_cnt += 1
363
+ p_col_idx = len(cur_proposals)-1
364
+ cur_pred += cur_proposals[p_col_idx].tolist()
365
+ cur_pred += [float(sorted_distances_row[col_idx])]
366
+ cur_ranked_predictions.append(cur_pred)
367
+ cur_query_pred = dict(
368
+ desc_id=eval_res["query_meta"][idx]["desc_id"],
369
+ desc=eval_res["query_meta"][idx]["desc"],
370
+ predictions=cur_ranked_predictions
371
+ )
372
+ vcmr_res.append(cur_query_pred)
373
+ logger.info("[DEBUG] out_bounds_cnt {}".format(out_bounds_cnt))
374
+ return vcmr_res, vr_res
375
+
376
+
377
+ def generate_vcmr_predictions_from_res(eval_res, max_prop_per_query=300, query_bsz_in_sort=1000):
378
+ """ This function is for Video Corpus Moment Retrieval (VCMR).
379
+ Generate prediction file which could be evaluated using standalone_eval.eval.
380
+ Args:
381
+ eval_res: dict(
382
+ query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str}
383
+ video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray}
384
+ video2idx=eval_dataset.video2idx, # dict {vid_name: index}
385
+ video_bsz_in_sort=[], # N_videos * (N_q, N_prop)
386
+ )
387
+ max_prop_per_query: int or None. If None, generate ranking for all possible moments, else generate top {}.
388
+ query_bsz_in_sort: int, only sort a subset of queries at a time, it will be too large to sort all queries.
389
+ return:
390
+ list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)),
391
+ each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance.
392
+ """
393
+ # video2idx
394
+ video2idx = eval_res["video2idx"]
395
+
396
+ # (N_videos, N_prop, N_q), (N_videos, N_prop)
397
+ padded_dist, padded_mask = pad_sequences_1d([e.transpose(0, 1) for e in eval_res["query_prop_dist_vcmr"]],
398
+ dtype=eval_res["query_prop_dist_vcmr"][0].dtype,
399
+ device=eval_res["query_prop_dist_vcmr"][0].device)
400
+ # putting 'NaN' into the invalid bits, torch.sort considers 'NaN' as larger than any number!!!
401
+ padded_dist += (padded_mask.unsqueeze(2) == 0).float() * 1e10
402
+ n_videos, n_prop, n_q = padded_dist.shape
403
+ print("n_videos, n_prop, n_q {}".format((n_videos, n_prop, n_q)))
404
+ padded_dist = padded_dist.view(n_videos * n_prop, n_q).transpose(0, 1).contiguous() # (N_q, N_video*N_prop)
405
+ print("padded_dist, {}".format(padded_dist.shape))
406
+
407
+ sorted_distances, sorted_indices = torch.topk(padded_dist.to(torch.device("cuda:0"), non_blocking=True),
408
+ k=min(max_prop_per_query, n_videos * n_prop),
409
+ dim=1, largest=False, sorted=True) # (N_q, max_prop_per_query) * 2
410
+ sorted_distances = - sorted_distances.cpu().numpy()
411
+
412
+ # (N_q, max_prop_per_query) * 2, prop_indices: inside video indices.
413
+ video_meta_indices = torch.floor(sorted_indices.float() / n_prop).long().cpu().numpy()
414
+ prop_indices = torch.remainder(sorted_indices, n_prop).cpu().numpy()
415
+
416
+ vr_res = []
417
+ query_meta = eval_res["query_meta"]
418
+ for i in trange(n_q, desc="[VR] Loop over queries to generate predictions"):
419
+ row = video_meta_indices[i]
420
+ score_row = - sorted_distances[i]
421
+ cur_vr_redictions = []
422
+ for j, meta_idx in enumerate(row):
423
+ video_idx = video2idx[eval_res["video_meta"][meta_idx]["vid_name"]]
424
+ cur_vr_redictions.append([video_idx, 0, 0, float(score_row[j])])
425
+ cur_query_pred = dict(
426
+ desc_id=query_meta[i]["desc_id"],
427
+ desc=query_meta[i]["desc"],
428
+ predictions=cur_vr_redictions
429
+ )
430
+ vr_res.append(cur_query_pred)
431
+
432
+ vcmr_res = []
433
+ logger.debug("sorted_indices {}".format(sorted_indices.shape))
434
+ logger.debug("sorted_distances {}".format(sorted_distances.shape))
435
+ for idx, (vm_row_indices, p_row_indices) in tqdm(enumerate(zip(video_meta_indices, prop_indices)),
436
+ desc="[VCMR] Loop over queries to generate predictions",
437
+ total=n_q): # query
438
+ sorted_distances_row = - sorted_distances[idx] # converted to negative distance
439
+ # [video_idx(int), st(float), ed(float), score(float)]
440
+ cur_ranked_predictions = []
441
+ for col_idx, (v_col_idx, p_col_idx) in enumerate(zip(vm_row_indices, p_row_indices)):
442
+ cur_pred = []
443
+ cur_pred += [video2idx[eval_res["video_meta"][v_col_idx]["vid_name"]], ]
444
+ cur_pred += eval_res["video_meta"][v_col_idx]["proposals"][p_col_idx].tolist()
445
+ cur_pred += [float(sorted_distances_row[col_idx])]
446
+ cur_ranked_predictions.append(cur_pred)
447
+ cur_query_pred = dict(
448
+ desc_id=eval_res["query_meta"][idx]["desc_id"],
449
+ desc=eval_res["query_meta"][idx]["desc"],
450
+ predictions=cur_ranked_predictions
451
+ )
452
+ vcmr_res.append(cur_query_pred)
453
+ return vcmr_res, vr_res
454
+
455
+
456
+ def generate_svmr_predictions_from_res(eval_res, max_prop_per_query=None):
457
+ """ This function is for Video Corpus Moment Retrieval (VCMR).
458
+ Generate prediction file which could be evaluated using standalone_eval.eval.
459
+ Args:
460
+ eval_res: dict(
461
+ query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str}
462
+ video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray}
463
+ video2idx=eval_dataset.video2idx, # dict {vid_name: index}
464
+ query_prop_dist_svmr=[], # N_q * (N_prop, )
465
+ )
466
+ max_prop_per_query: not used
467
+ return:
468
+ list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)),
469
+ each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance.
470
+ """
471
+ video2idx = eval_res["video2idx"]
472
+
473
+ svmr_res = []
474
+ svmr_video2meta_idx = {e["vid_name"]: idx for idx, e in enumerate(eval_res["video_meta"])}
475
+ for idx, (q_p_dist, q_m) in tqdm(enumerate(zip(eval_res["query_prop_dist_svmr"], eval_res["query_meta"])),
476
+ desc="Loop over queries to generate predictions",
477
+ total=len(eval_res["query_prop_dist_svmr"])): # query
478
+ sorted_indices = np.argsort(q_p_dist) # (N_prop, ) # ascending order, distance
479
+ if max_prop_per_query is not None:
480
+ sorted_indices = sorted_indices[:max_prop_per_query]
481
+ v_eval_idx = video2idx[q_m["vid_name"]]
482
+ v_meta_idx = svmr_video2meta_idx[q_m["vid_name"]]
483
+ proposals = eval_res["video_meta"][v_meta_idx]["proposals"] # (N_p, 2)
484
+ # [video_idx(int), st(float), ed(float), score(float)]
485
+ cur_ranked_predictions = [
486
+ [v_eval_idx, ] + proposals[sort_idx].tolist() + [- round(float(q_p_dist[sort_idx]), 4), ]
487
+ for sort_idx in sorted_indices]
488
+ cur_query_pred = dict(
489
+ desc_id=q_m["desc_id"],
490
+ desc=q_m["desc"],
491
+ predictions=cur_ranked_predictions
492
+ )
493
+ svmr_res.append(cur_query_pred)
494
+ return svmr_res
495
+
496
+
497
+ POST_PROCESSING_MMS_FUNC = {
498
+ "SVMR": post_processing_svmr_nms,
499
+ "VCMR": post_processing_vcmr_nms
500
+ }
501
+
502
+
503
+ def get_submission_top_n(submission, top_n=100):
504
+ def get_prediction_top_n(list_dict_predictions, top_n):
505
+ top_n_res = []
506
+ for e in list_dict_predictions:
507
+ e["predictions"] = e["predictions"][:top_n]
508
+ top_n_res.append(e)
509
+ return top_n_res
510
+
511
+ top_n_submission = dict(video2idx=submission["video2idx"], )
512
+ for k in submission:
513
+ if k != "video2idx":
514
+ top_n_submission[k] = get_prediction_top_n(submission[k], top_n)
515
+ return top_n_submission
516
+
517
+
518
+ def load_external_vr_res(external_vr_res_path, top_n_vr_videos=5):
519
+ """return a mapping from desc_id to top retrieved video id"""
520
+ external_vr_res = load_json(external_vr_res_path)
521
+ external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"]
522
+ query2video = {e["desc_id"]: [sub_e[0] for sub_e in e["predictions"]] for e in external_vr_res}
523
+ return query2video
524
+
525
+
526
+ def eval_epoch(model, eval_dataset, opt, save_submission_filename,
527
+ tasks=("SVMR",), max_before_nms=1000, max_after_nms=100):
528
+ model.eval()
529
+ logger.info("Computing scores")
530
+ logger.info("Start timing")
531
+ # times = [] # do not use
532
+ # for _ in range(3):
533
+ # st_time = time.time()
534
+ if opt.use_intermediate:
535
+ intermediate_cache_path = os.path.join(opt.results_dir, "{}_eval_res.pt".format(opt.eval_split_name))
536
+ if not os.path.exists(intermediate_cache_path):
537
+ logger.info("Saving intermediate results {}.".format(intermediate_cache_path))
538
+ eval_res = compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks)
539
+ torch.save(eval_res, intermediate_cache_path)
540
+ else:
541
+ logger.info("Loading intermediate results {}.".format(intermediate_cache_path))
542
+ eval_res = torch.load(intermediate_cache_path)
543
+ else:
544
+ logger.info("Running without saving intermediate results, you might want to turn on --use_intermediate.")
545
+ eval_res = compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks)
546
+ # del model # We dont need model anymore
547
+
548
+ # eval_res = compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks)
549
+
550
+ logger.info("Generating predictions from scores")
551
+ eval_submission_raw = dict(video2idx=eval_res["video2idx"])
552
+ if "SVMR" in tasks:
553
+ eval_submission_raw["SVMR"] = generate_svmr_predictions_from_res(
554
+ eval_res, max_prop_per_query=max_before_nms)
555
+ # vcmr_loading_time = 0
556
+ if "VCMR" in tasks:
557
+ if opt.external_inference_vr_res_path is not None:
558
+ logger.info("Using external VR results from {}".format(opt.external_inference_vr_res_path))
559
+ # vcmr_loading_time = time.time()
560
+ eval_res["external_query2video"] = load_external_vr_res(
561
+ opt.external_inference_vr_res_path, top_n_vr_videos=5)
562
+ # vcmr_loading_time = time.time() - vcmr_loading_time
563
+ vcmr_res, vr_res = generate_vcmr_predictions_from_res_with_external(
564
+ eval_res, max_prop_per_query=max_before_nms)
565
+ else:
566
+ vcmr_res, vr_res = generate_vcmr_predictions_from_res(
567
+ eval_res, max_prop_per_query=max_before_nms)
568
+ eval_submission_raw["VCMR"] = vcmr_res
569
+ eval_submission_raw["VR"] = vr_res
570
+ # times += [time.time() - st_time - vcmr_loading_time]
571
+ # times = torch.FloatTensor(times)
572
+ IOU_THDS = (0.5, 0.7)
573
+
574
+ logger.info("Saving/Evaluating before nms results")
575
+ submission_path = os.path.join(opt.results_dir, save_submission_filename)
576
+ eval_submission = get_submission_top_n(eval_submission_raw, top_n=max_after_nms)
577
+ if max_after_nms < 1000:
578
+ save_json(eval_submission, submission_path)
579
+ else:
580
+ torch.save(eval_submission, submission_path.replace(".json", ".pt"))
581
+
582
+ metrics = eval_retrieval(eval_submission, eval_dataset.query_data,
583
+ iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug,
584
+ use_desc_type=opt.dset_name == "tvr")
585
+ # metrics["time_avg"] = float(times.mean())
586
+ # metrics["time_std"] = float(times.std())
587
+ save_metrics_path = submission_path.replace(".json", "_metrics.json")
588
+ save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False)
589
+ latest_file_paths = [submission_path, save_metrics_path]
590
+
591
+ if opt.nms_thd != -1:
592
+ logger.info("Performing nms with nms_thd {}".format(opt.nms_thd))
593
+ eval_submission_after_nms = dict(video2idx=eval_submission_raw["video2idx"])
594
+ for k, nms_func in POST_PROCESSING_MMS_FUNC.items():
595
+ if k in eval_submission_raw:
596
+ eval_submission_after_nms[k] = nms_func(eval_submission_raw[k],
597
+ nms_thd=opt.nms_thd,
598
+ max_before_nms=max_before_nms,
599
+ max_after_nms=max_after_nms)
600
+
601
+ logger.info("Saving/Evaluating nms results")
602
+ submission_nms_path = submission_path.replace(".json", "_nms_thd_{}.json".format(opt.nms_thd))
603
+ save_json(eval_submission_after_nms, submission_nms_path)
604
+ metrics_nms = eval_retrieval(eval_submission_after_nms, eval_dataset.query_data,
605
+ iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug)
606
+ save_metrics_nms_path = submission_nms_path.replace(".json", "_metrics.json")
607
+ save_json(metrics_nms, save_metrics_nms_path, save_pretty=True, sort_keys=False)
608
+ latest_file_paths += [submission_nms_path, save_metrics_nms_path]
609
+ else:
610
+ metrics_nms = None
611
+ return metrics, metrics_nms, latest_file_paths
612
+
613
+
614
+ def setup_model(opt):
615
+ """Load model from checkpoint and move to specified device"""
616
+ checkpoint = torch.load(opt.ckpt_filepath)
617
+ model = CALWithSub(checkpoint["model_cfg"])
618
+ model.load_state_dict(checkpoint["model"])
619
+ logger.info("Loaded model saved at epoch {} from checkpoint: {}"
620
+ .format(checkpoint["epoch"], opt.ckpt_filepath))
621
+
622
+ if opt.device.type == "cuda":
623
+ logger.info("CUDA enabled.")
624
+ model.to(opt.device)
625
+ if len(opt.device_ids) > 1:
626
+ logger.info("Use multi GPU", opt.device_ids)
627
+ model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU
628
+ return model
629
+
630
+
631
+ def start_inference():
632
+ logger.info("Setup config, data and model...")
633
+ opt = TestOptions().parse()
634
+ cudnn.benchmark = False
635
+ cudnn.deterministic = True
636
+
637
+ assert opt.eval_path is not None
638
+ eval_dataset = ProposalRetrievalEvalDataset(
639
+ dset_name=opt.dset_name,
640
+ model_type=opt.model_type,
641
+ eval_split_name=opt.eval_split_name, # should only be val set
642
+ data_path=opt.eval_path,
643
+ desc_bert_path_or_handler=opt.desc_bert_path,
644
+ sub_bert_path_or_handler=opt.sub_bert_path,
645
+ max_desc_len=opt.max_desc_l,
646
+ corpus_path=opt.corpus_path,
647
+ vid_feat_path_or_handler=opt.vid_feat_path,
648
+ clip_length=opt.clip_length,
649
+ eval_proposal_bsz=opt.eval_proposal_bsz,
650
+ ctx_mode=opt.ctx_mode,
651
+ data_mode="query",
652
+ h5driver=opt.h5driver,
653
+ data_ratio=opt.data_ratio,
654
+ normalize_vfeat=not opt.no_norm_vfeat,
655
+ normalize_tfeat=not opt.no_norm_tfeat,
656
+ )
657
+
658
+ model = setup_model(opt)
659
+ save_submission_filename = \
660
+ "inference_{}_{}_{}_predictions_{}.json".format(
661
+ opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks))
662
+ logger.info("Starting inference...")
663
+ with torch.no_grad():
664
+ metrics_no_nms, metrics_nms, latest_file_paths = \
665
+ eval_epoch(model, eval_dataset, opt, save_submission_filename, tasks=opt.tasks,
666
+ max_before_nms=opt.max_before_nms, max_after_nms=opt.max_after_nms)
667
+ logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4)))
668
+ logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4)))
669
+
670
+
671
+ if __name__ == '__main__':
672
+ start_inference()
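For the VCMR branch above, per-video query-to-proposal distances are padded and flattened into one `(N_q, N_videos * N_prop)` matrix, a top-k is taken over the flat axis, and each flat index is decoded back into a (video, proposal) pair via floor and remainder. A toy, self-contained illustration of just that decoding step (made-up shapes, mirroring the code above):
```
import torch

n_videos, n_prop, n_q = 4, 3, 2
padded_dist = torch.rand(n_q, n_videos * n_prop)   # flattened (video, proposal) distances per query
dists, flat_idx = torch.topk(padded_dist, k=5, dim=1, largest=False, sorted=True)
video_meta_indices = torch.floor(flat_idx.float() / n_prop).long()  # which video each hit came from
prop_indices = torch.remainder(flat_idx, n_prop)                    # which proposal inside that video
print(video_meta_indices)
print(prop_indices)
```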
baselines/clip_alignment_with_language/local_utils/__init__.py ADDED
File without changes
baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (217 Bytes).
baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc ADDED
Binary file (8.16 kB).
baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc ADDED
Binary file (7.9 kB).
baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py ADDED
@@ -0,0 +1,117 @@
1
+ """
2
+ Compute oracle upper bound for a given proposal method, which acts like
3
+ a reversed recall, where we recall the GT timestamp pairs in the set of
4
+ generated proposals.
5
+ """
6
+ import pprint
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+ from collections import Counter
10
+ from utils.basic_utils import load_jsonl, save_json
11
+ from standalone_eval.eval import compute_temporal_iou_batch
12
+ from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface, ProposalConfigs
13
+
14
+
15
+ def get_didemo_agreed_ts(times_list):
16
+ """
17
+ input example: [[1, 1], [1, 1], [1, 1], [0, 0]],
18
+ return: [1, 1]"""
19
+ times_str_list = [tuple(e) for e in times_list]
20
+ times_str_list_counter = Counter(times_str_list)
21
+ most_frequent_times = times_str_list_counter.most_common(1)[0][0]
22
+ return most_frequent_times
23
+
24
+
25
+ def get_proposals_for_single_desc_video_pair(single_data, proposal_fn, dset_name):
26
+ proposal_info = dict(
27
+ vid_name=single_data["vid_name"],
28
+ desc_id=single_data["desc_id"],
29
+ gt_ts=single_data["ts"] if dset_name != "didemo" else get_didemo_agreed_ts(single_data["ts"]),
30
+ proposals=proposal_fn(video_id="", metadata={"duration": single_data["duration"]}),
31
+ )
32
+ proposal_info["proposal_ious"] = compute_temporal_iou_batch(
33
+ proposal_info["proposals"], proposal_info["gt_ts"])
34
+ return proposal_info
35
+
36
+
37
+ def get_proposals_for_videos(datalist, dset_name):
38
+ """datalist list(dict): each dict is
39
+ {"desc_id": str/int, "duration": float, "ts": [st (float), ed (float)], ...}
40
+ Note for Didemo dataset, "ts" entry is a list of [st (float), ed (float)] from different annotators,
41
+ here we use the most frequent ts, breaking ties by randomly sampling one
42
+ """
43
+ proposal_interface = get_proposal_interface(dset_name)
44
+ video_proposals_list = []
45
+ for e in tqdm(datalist, desc="Computing video proposals"):
46
+ video_proposals_list.append(
47
+ get_proposals_for_single_desc_video_pair(e, proposal_interface, dset_name))
48
+ return video_proposals_list
49
+
50
+
51
+ def is_recalled_single_moment(proposal_ious, iou_thds=(0.5, 0.7)):
52
+ """
53
+ Args:
54
+ proposal_ious: np.ndarray, shape (N_proposal, )
55
+ iou_thds: set, temporal IoU thresholds
56
+
57
+ Returns:
58
+ list(bool), len == len(iou_thds), indicates whether recall under a iou_thd is found.
59
+ """
60
+ recalled = [False, ] * len(iou_thds)
61
+ for idx, iou_thd in enumerate(iou_thds):
62
+ recalled[idx] = np.sum(proposal_ious >= iou_thd) >= 1 # at least one
63
+ return recalled
64
+
65
+
66
+ def compute_proposal_recall_upper_bound(video_proposals_list, iou_thds=(0.5, 0.7)):
67
+ """video_proposals_list from get_proposals_for_videos()"""
68
+ iou_corrects = np.empty((len(video_proposals_list), len(iou_thds)), dtype=np.float32)
69
+ for idx, d in tqdm(enumerate(video_proposals_list),
70
+ desc="Computing recall for videos",
71
+ total=len(video_proposals_list)):
72
+ iou_corrects[idx] = is_recalled_single_moment(d["proposal_ious"],
73
+ iou_thds=iou_thds)
74
+ recall_by_iou = {iou_thd: float(np.mean(iou_corrects[:, idx]))
75
+ for idx, iou_thd in enumerate(iou_thds)}
76
+ return recall_by_iou
77
+
78
+
79
+ def main_compute_upper_bound():
80
+ import argparse
81
+ parser = argparse.ArgumentParser()
82
+ parser.add_argument("-dset_name", type=str, choices=["tvr"])
83
+ parser.add_argument("-eval_file_path", type=str, help="path to the file containing data to be evaluated")
84
+ parser.add_argument("-save_path", type=str, help="path to save the results")
85
+ parser.add_argument("-verbose", action="store_true")
86
+ args = parser.parse_args()
87
+
88
+ eval_datalist = load_jsonl(args.eval_file_path)
89
+ video_proposals_list = get_proposals_for_videos(eval_datalist, args.dset_name)
90
+ recall_metrics = compute_proposal_recall_upper_bound(video_proposals_list, iou_thds=(0.5, 0.7))
91
+
92
+ video_proposals_list_by_video = {}
93
+ for p in video_proposals_list:
94
+ if p["vid_name"] in video_proposals_list_by_video:
95
+ continue
96
+ else:
97
+ video_proposals_list_by_video[p["vid_name"]] = p
98
+ video_proposals_list_by_video = list(video_proposals_list_by_video.values())
99
+ total_n_clips_in_proposals = \
100
+ np.sum([np.sum(e["proposals"][:, 1] - e["proposals"][:, 0]) for e in video_proposals_list_by_video])
101
+
102
+ results = dict(
103
+ avg_num_proposals=float(np.mean([len(e["proposals"]) for e in video_proposals_list_by_video])),
104
+ total_num_proposals=int(np.sum([len(e["proposals"]) for e in video_proposals_list_by_video])),
105
+ recall_metrics=recall_metrics,
106
+ dset_name=args.dset_name,
107
+ filename=args.eval_file_path,
108
+ proposal_config=ProposalConfigs[args.dset_name]
109
+ )
110
+ results["avg_clip_per_proposal"] = total_n_clips_in_proposals / results["total_num_proposals"]
111
+ save_json(results, args.save_path, save_pretty=True)
112
+ if args.verbose:
113
+ pprint.pprint(results)
114
+
115
+
116
+ if __name__ == '__main__':
117
+ main_compute_upper_bound()
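A minimal sketch of driving the oracle-recall computation above directly from Python, on a hypothetical two-entry datalist (field names follow the jsonl records this script expects; assumes the repo root is on PYTHONPATH so the imports above resolve):

from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import (
    get_proposals_for_videos, compute_proposal_recall_upper_bound)

# hypothetical records mimicking tvr_*_release.jsonl entries
toy_datalist = [
    {"vid_name": "v1", "desc_id": 0, "duration": 30.0, "ts": [3.0, 9.0]},
    {"vid_name": "v2", "desc_id": 1, "duration": 45.0, "ts": [10.0, 40.0]},
]
video_proposals = get_proposals_for_videos(toy_datalist, "tvr")
recall = compute_proposal_recall_upper_bound(video_proposals, iou_thds=(0.5, 0.7))
print(recall)  # {0.5: ..., 0.7: ...} -- fraction of GT moments covered by at least one proposal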
baselines/clip_alignment_with_language/local_utils/proposal.py ADDED
@@ -0,0 +1,181 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2018 Victor Escorcia Castillo
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+ # ==============================================================================
23
+ """
24
+ Group multiple methods to generate salient temporal windows in a video"""
25
+ import itertools
26
+ import numpy as np
27
+
28
+ PROPOSAL_SCHEMES = ['DidemoICCV17SS', 'SlidingWindowMSRSS']
29
+
30
+
31
+ class TemporalProposalsBase:
32
+ """Base class (signature) to generate temporal candidates in a video"""
33
+ def __call__(self, video_id, metadata=None, feature_collection=None):
34
+ raise NotImplementedError('Implement with the signature above')
35
+
36
+
37
+ class DidemoICCV17SS(TemporalProposalsBase):
38
+ """Original search space of moments proposed in ICCV-2017
39
+
40
+ Attributes:
41
+ clip_length_min (float) : minimum length, in seconds, of a video clip.
42
+ proposals (numpy array) : of shape [21, 2] representing all the
43
+ possible temporal segments of valid annotations of DiDeMo dataset.
44
+ It represents the search space of a temporal localization
45
+ algorithm.
46
+
47
+ Reference: Hendricks et al. Localizing Moments in Video with Natural
48
+ Language. ICCV 2017.
49
+ """
50
+ clip_length_min = 5.0
51
+
52
+ def __init__(self, *args, dtype=np.float32, **kwargs):
53
+ clips_indices = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
54
+ for i in itertools.combinations(range(len(clips_indices)), 2):
55
+ clips_indices.append(i)
56
+ self.proposals = np.array(clips_indices, dtype=dtype)
57
+ self.proposals *= self.clip_length_min
58
+ self.proposals[:, 1] += self.clip_length_min
59
+
60
+ def __call__(self, *args, **kwargs):
61
+ return self.proposals
62
+
63
+
64
+ class SlidingWindowMSRSS(TemporalProposalsBase):
65
+ """Multi-scale sliding window with relative stride within the same scale
66
+
67
+ Attributes:
68
+ length (float) : length of smallest window.
69
+ scales (sequence of int) : duration of moments relative to
70
+ `length`.
71
+ stride (float) : relative stride between two windows with the same
72
+ duration. We use a different stride for each scale, rounding it
73
+ towards a multiple of `length`. Note that the minimum stride is
74
+ `length` itself for any window.
75
+ dtype (numpy.dtype) :
76
+ """
77
+
78
+ def __init__(self, length, scales, stride=0.5, round_base=0.5, dtype=np.float32):
79
+ self.length = length
80
+ self.scales = scales
81
+ self.round_base = round_base
82
+ self.relative_stride = stride
83
+ # pick strides per scale that are multiples of length
84
+ self.strides = [max(round(s * stride / round_base) * round_base, round_base)
85
+ * length for s in scales]
86
+ self.dtype = dtype
87
+ assert len(scales) > 0
88
+
89
+ def sliding_windows(self, t_end, t_start=0):
90
+ """sliding canonical windows over a given time interval"""
91
+ windows_ = []
92
+ for i, stride in enumerate(self.strides):
93
+ num_i = np.ceil((t_end - t_start) / stride)
94
+ windows_i = np.empty((int(num_i), 2), dtype=np.float32)
95
+ windows_i[:, 0] = np.arange(t_start, t_end, stride)
96
+ windows_i[:, 1] = windows_i[:, 0] + self.length * self.scales[i]
97
+ windows_i[windows_i[:, 1] > t_end, 1] = t_end
98
+ windows_.append(windows_i)
99
+ # print("--------------------------------{}".format(i))
100
+ # print(windows_i)
101
+ # import sys
102
+ # sys.exit(1)
103
+ windows = np.concatenate(windows_, axis=0)
104
+ # Hacky way to make windows fit inside video
105
+ # It implies windows at the end may not belong to the set spanned by
106
+ # length and scales.
107
+ return np.unique(windows, axis=0)
108
+
109
+ def __call__(self, video_id, metadata=None, feature_collection=None):
110
+ """return: (N_window, 2), each row contains (start, end)"""
111
+ duration = metadata.get('duration')
112
+ assert duration is not None
113
+ return self.sliding_windows(duration)
114
+
115
+
116
+ ProposalConfigs = {
117
+ "didemo": {
118
+ "proposal_interface": "DidemoICCV17SS",
119
+ "clip_length": 2.5,
120
+ },
121
+ "tvr": {
122
+ "length": 3, # min proposal length
123
+ "scales": [1, 2, 4, 8],
124
+ "stride": 0.3,
125
+ "round_base": 1,
126
+ "min_proposal_length": 3, # length * min(scales)
127
+ "clip_length": 1.5, # length should be divisible by clip_length
128
+ "proposal_interface": "SlidingWindowMSRSS",
129
+ },
130
+ "anet_cap": {
131
+ "length": 5,
132
+ "scales": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26],
133
+ "stride": 0.3,
134
+ "round_base": 1,
135
+ "min_proposal_length": 10, # length * min(scales)
136
+ "clip_length": 5, # length * min(scales) / 2
137
+ "proposal_interface": "SlidingWindowMSRSS",
138
+ },
139
+ "charades_sta": {
140
+ "length": 3,
141
+ "scales": [2, 3, 4, 5, 6, 7, 8],
142
+ "stride": 0.3,
143
+ "round_base": 1,
144
+ "min_proposal_length": 6, # length * min(scales)
145
+ "clip_length": 3, # length * min(scales) / 2
146
+ "proposal_interface": "SlidingWindowMSRSS",
147
+ },
148
+ "profiling": {
149
+ "length": 5,
150
+ "scales": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
151
+ "stride": 0.3,
152
+ "round_base": 1,
153
+ "clip_length": 5, # length * min(scales) / 2
154
+ "proposal_interface": "SlidingWindowMSRSS",
155
+ },
156
+ }
157
+ """
158
+ 'min_clip_length' is used to uniformly segment the video into smaller clips; it is half of
159
+ the 'min_proposal_length', so each moment is guaranteed to contain at least 2 clips.
160
+ """
161
+
162
+
163
+ def get_proposal_interface(dset_name):
164
+ """ dset_name (str): one of ["tvr"] """
165
+ assert dset_name in ProposalConfigs
166
+ if dset_name == "didemo":
167
+ return DidemoICCV17SS()
168
+ else:
169
+ arg_names = ["length", "scales", "stride", "round_base"]
170
+ func_args = {k: ProposalConfigs[dset_name][k] for k in arg_names}
171
+ return SlidingWindowMSRSS(**func_args)
172
+
173
+
174
+ if __name__ == '__main__':
175
+ test_fns_args = [(DidemoICCV17SS, (),),
176
+ (SlidingWindowMSRSS, (1.5, [2, 4, 6, 12]))]
177
+ for fn_i, args_i in test_fns_args:
178
+ proposal_fn = fn_i(*args_i)
179
+ x = proposal_fn('hola', {'duration': 15})
180
+ if fn_i == DidemoICCV17SS:
181
+ assert len(x) == 21
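For reference, a small self-contained sketch of how the TVR proposal interface defined above is typically used (only assumes the repo root is importable):

from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface

proposal_fn = get_proposal_interface("tvr")  # SlidingWindowMSRSS(length=3, scales=[1, 2, 4, 8], stride=0.3, round_base=1)
windows = proposal_fn(video_id="", metadata={"duration": 30})
print(windows.shape)  # (N_window, 2); each row is (start, end) in seconds, with ends capped at the 30s duration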
baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt ADDED
@@ -0,0 +1,61 @@
1
+
2
+ """
3
+ {'avg_num_proposals': 158.30197338228544,
4
+ 'dset_name': 'tvr',
5
+ 'filename': 'data/retrieval_release_data_with_ids/tvr_val_release.jsonl',
6
+ 'proposal_config': {'length': 3,
7
+ 'proposal_interface': 'SlidingWindowMSRSS',
8
+ 'round_base': 1,
9
+ 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16],
10
+ 'stride': 0.3},
11
+ 'recall_metrics': {0.5: 0.8927030563354492, 0.7: 0.6690225005149841},
12
+ 'total_num_proposals': 344940}
13
+
14
+
15
+ {'avg_num_proposals': 213.3295089490592,
16
+ 'dset_name': 'tvr',
17
+ 'filename': 'data/retrieval_release_data_with_ids/tvr_val_release.jsonl',
18
+ 'proposal_config': {'length': 3,
19
+ 'min_clip_length': 1.5,
20
+ 'min_proposal_length': 3,
21
+ 'proposal_interface': 'SlidingWindowMSRSS',
22
+ 'round_base': 0.5,
23
+ 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16],
24
+ 'stride': 0.3},
25
+ 'recall_metrics': {0.5: 0.9612666368484497, 0.7: 0.8215695023536682},
26
+ 'total_num_proposals': 464845}
27
+ --
28
+
29
+
30
+ {'avg_num_proposals': 213.3295089490592,
31
+ 'dset_name': 'tvr',
32
+ 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl',
33
+ 'proposal_config': {'length': 3,
34
+ 'proposal_interface': 'SlidingWindowMSRSS',
35
+ 'round_base': 0.5,
36
+ 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16],
37
+ 'stride': 0.3},
38
+ 'recall_metrics': {0.5: 0.9612666368484497, 0.7: 0.8215695023536682}}
39
+
40
+
41
+ {'avg_num_proposals': 263.3845800826067,
42
+ 'dset_name': 'tvr',
43
+ 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl',
44
+ 'proposal_config': {'length': 3,
45
+ 'proposal_interface': 'SlidingWindowMSRSS',
46
+ 'round_base': 0.5,
47
+ 'scales': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16],
48
+ 'stride': 0.3},
49
+ 'recall_metrics': {0.5: 0.9841211438179016, 0.7: 0.8567232489585876}}
50
+
51
+
52
+ {'avg_num_proposals': 242.97246443322626,
53
+ 'dset_name': 'tvr',
54
+ 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl',
55
+ 'proposal_config': {'length': 3,
56
+ 'proposal_interface': 'SlidingWindowMSRSS',
57
+ 'round_base': 0.5,
58
+ 'scales': [0.5, 1, 2, 3, 4, 5, 6, 7, 8],
59
+ 'stride': 0.3},
60
+ 'recall_metrics': {0.5: 0.9608076810836792, 0.7: 0.8212941884994507}}
61
+ """
baselines/clip_alignment_with_language/mix_model_prediction.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ Implement the CAL + CAL (TEF) model mentioned in
3
+ ```
4
+ @article{Escorcia2019TemporalLO,
5
+ title={Temporal Localization of Moments in Video Collections with Natural Language},
6
+ author={Victor Escorcia and Mattia Soldan and Josef Sivic and Bernard Ghanem and Bryan Russell},
7
+ journal={ArXiv},
8
+ year={2019},
9
+ volume={abs/1907.12763}
10
+ }
11
+ ```
12
+
13
+ Methods:
14
+ 1) Take the top-200 predictions for each query from CAL, then re-rank them with CAL (TEF).
15
+ 2) This is approximated by re-ranking the top-200 CAL predictions with the top-1000 CAL (TEF) predictions -- we assume the former are all covered by the latter.
16
+ """
17
+
18
+ import torch
19
+ import subprocess
20
+ import numpy as np
21
+ from tqdm import tqdm
22
+ from utils.basic_utils import load_json, save_json
23
+
24
+
25
+ def load_saved_res(pred_path):
26
+ if pred_path.endswith(".json"):
27
+ pred = load_json(pred_path)
28
+ else:
29
+ pred = torch.load(pred_path)
30
+ vcmr_res = {e["desc_id"]: e for e in pred["VCMR"]}
31
+ video2idx = pred["video2idx"]
32
+ return vcmr_res, video2idx
33
+
34
+
35
+ def main_mix_results(pred_path, tef_pred_path, save_path, max_after_nms=100):
36
+ """
37
+ Args:
38
+ pred_path: contains top-200 VCMR predictions
39
+ tef_pred_path: contains top-1000 VCMR predictions
40
+ save_path:
41
+ max_after_nms: int,
42
+ Returns:
43
+ save
44
+ """
45
+ vcmr_res, video2idx = load_saved_res(pred_path)
46
+ tef_vcmr_res, video2idx = load_saved_res(tef_pred_path)
47
+
48
+ reranked_vcmr_res = {}
49
+ num_valid = []
50
+ for desc_id, preds in tqdm(vcmr_res.items(), desc="Loop over the predictions"):
51
+ tef_preds = tef_vcmr_res[desc_id]["predictions"]
52
+ pred_moments = set([tuple(e[:3]) for e in preds["predictions"]])
53
+ reranked_moments = [e for e in tef_preds if tuple(e[:3]) in pred_moments][:max_after_nms]
54
+ num_valid += [len(reranked_moments)]
55
+ if len(reranked_moments) != 100:
56
+ reranked_moments += reranked_moments[:100 - len(reranked_moments)]
57
+ reranked_vcmr_res[desc_id] = dict(
58
+ predictions=reranked_moments,
59
+ desc_id=desc_id,
60
+ desc=preds["desc"]
61
+ )
62
+
63
+ print("There are {} moments found on average".format(np.mean(num_valid)))
64
+ reranked_predictions = dict(
65
+ VCMR=list(reranked_vcmr_res.values()),
66
+ video2idx=video2idx
67
+ )
68
+
69
+ save_json(reranked_predictions, save_path)
70
+
71
+
72
+ if __name__ == '__main__':
73
+ import argparse
74
+ parser = argparse.ArgumentParser()
75
+ parser.add_argument("--pred_path", type=str, help="path to prediction res")
76
+ parser.add_argument("--tef_pred_path", type=str, help="path to TEF prediction res")
77
+ parser.add_argument("--save_path", type=str, help="path to save the re-ranked predictions, same dir as --pred_path")
78
+ parser.add_argument("--gt_path", type=str, help="path to ground truth file")
79
+ args = parser.parse_args()
80
+
81
+ main_mix_results(args.pred_path, args.tef_pred_path, args.save_path)
82
+
83
+ metrics_path = args.save_path.replace(".json", "_metrics.json")
84
+ eval_cmd = "python standalone_eval/eval.py --submission_path " + args.save_path + " --gt_path " + args.gt_path + \
85
+ " --save_path " + metrics_path
86
+ results = subprocess.run(eval_cmd, shell=True)
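The core of main_mix_results() is the re-ranking step: keep the moments proposed by CAL, but order them by their rank in the longer CAL (TEF) list. A toy illustration with hypothetical [video_idx, start, end, score] predictions:

cal_preds = [[3, 0.0, 6.0, 0.91], [3, 6.0, 9.0, 0.88], [7, 0.0, 3.0, 0.80]]  # top-k from CAL
tef_preds = [[7, 0.0, 3.0, 0.95], [3, 6.0, 9.0, 0.90], [3, 0.0, 6.0, 0.85],
             [9, 1.5, 4.5, 0.70]]  # longer list from CAL (TEF)

cal_moments = set(tuple(e[:3]) for e in cal_preds)
reranked = [e for e in tef_preds if tuple(e[:3]) in cal_moments]
print(reranked)  # the CAL moments, now ordered by their CAL (TEF) rank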
baselines/clip_alignment_with_language/model.py ADDED
@@ -0,0 +1,299 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from utils.model_utils import RNNEncoder
5
+ from easydict import EasyDict as edict
6
+
7
+
8
+ cal_base_cfg = edict(
9
+ visual_input_size=2048, # changes based on visual input type
10
+ textual_input_size=768,
11
+ query_feat_size=768,
12
+ visual_hidden_size=500, #
13
+ output_size=100,
14
+ embedding_size=768,
15
+ lstm_hidden_size=1000,
16
+ margin=0.1, # margin for ranking loss
17
+ loss_type="hinge", # loss type, 'hinge' or 'lse'
18
+ inter_loss_weight=0.4, # weight for inter negatives
19
+ ctx_mode="video"
20
+ )
21
+
22
+
23
+ class CAL(nn.Module):
24
+ def __init__(self, config):
25
+ super(CAL, self).__init__()
26
+ self.config = config
27
+
28
+ self.moment_mlp = nn.Sequential(
29
+ nn.Linear(config.visual_input_size, config.visual_hidden_size),
30
+ nn.ReLU(True),
31
+ nn.Linear(config.visual_hidden_size, config.output_size),
32
+ )
33
+
34
+ self.query_lstm = RNNEncoder(word_embedding_size=config.embedding_size,
35
+ hidden_size=config.lstm_hidden_size,
36
+ bidirectional=False,
37
+ rnn_type="lstm",
38
+ dropout_p=0,
39
+ n_layers=1,
40
+ return_outputs=False)
41
+
42
+ self.query_linear = nn.Linear(config.lstm_hidden_size, config.output_size)
43
+
44
+ def moment_encoder(self, moment_feat):
45
+ """moment_feat: (N, L_clip, D_v)"""
46
+ return F.normalize(self.moment_mlp(moment_feat), p=2, dim=-1) # (N, L_clip, D_o)
47
+
48
+ def query_encoder(self, query_feat, query_mask):
49
+ """
50
+ Args:
51
+ query_feat: (N, L_q, D_q), torch.float32
52
+ query_mask: (N, L_q), torch.float32, with 1 indicates valid query, 0 indicates mask
53
+ """
54
+ _, hidden = self.query_lstm(query_feat, torch.sum(query_mask, dim=1).long())
55
+ return F.normalize(self.query_linear(hidden), p=2, dim=-1) # (N, D_o)
56
+
57
+ def compute_pdist(self, query_embedding, moment_feat, moment_mask):
58
+ """ pairwise L2 distance
59
+ Args:
60
+ query_embedding: (N, D_o)
61
+ moment_feat: (N, L_clip, D_v)
62
+ moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding
63
+ """
64
+ moment_embedding = self.moment_encoder(moment_feat) # (N, L_clip, D_o)
65
+ moment_clip_dist = torch.sum((moment_embedding - query_embedding.unsqueeze(1)) ** 2, dim=2) # (N, L_clip)
66
+ moment_dist = torch.sum(moment_clip_dist * moment_mask, dim=1) / moment_mask.sum(1) # (N, )
67
+ return moment_dist # (N, )
68
+
69
+ @classmethod
70
+ def compute_cdist_inference(cls, query_embeddings, moment_embeddings, moment_mask):
71
+ """ Compute L2 distance for every possible pair of queries and proposals. This is different from
72
+ compute_pdist as the latter computes only pairs at each row.
73
+ Args:
74
+ query_embeddings: (N_q, D_o)
75
+ moment_embeddings: (N_prop, N_clips, D_o)
76
+ moment_mask: (N_prop, N_clips)
77
+ return:
78
+ query_moment_scores: (N_q, N_prop)
79
+ """
80
+ # sync device
81
+ query_device = query_embeddings.device # convert to cuda if we want to use GPU
82
+ if moment_embeddings.device != query_device:
83
+ moment_embeddings = moment_embeddings.to(query_device)
84
+ moment_mask = moment_mask.to(query_device)
85
+
86
+ # compute
87
+ n_query = query_embeddings.shape[0]
88
+ n_prop, n_clips, d = moment_embeddings.shape
89
+ query_clip_dist = torch.cdist(
90
+ query_embeddings, moment_embeddings.reshape(-1, d), p=2) ** 2 # (N_q, N_prop * N_clips)
91
+ query_clip_dist = query_clip_dist.reshape(n_query, n_prop, n_clips)
92
+ query_moment_dist = torch.sum(
93
+ query_clip_dist * moment_mask.unsqueeze(0), dim=2) / moment_mask.sum(1).unsqueeze(0)
94
+ return query_moment_dist # (N_q, N_prop)
95
+
96
+ def forward(self, query_feat, query_mask, pos_moment_feat, pos_moment_mask,
97
+ intra_neg_moment_feat, intra_neg_moment_mask,
98
+ inter_neg_moment_feat, inter_neg_moment_mask):
99
+ """
100
+ Args:
101
+ query_feat: (N, L, D_q)
102
+ query_mask: (N, L)
103
+ pos_moment_feat: (N, L_clip_1, D_v)
104
+ pos_moment_mask: (N, L_clip_1)
105
+ intra_neg_moment_feat: (N, L_clip_2, D_v)
106
+ intra_neg_moment_mask: (N, L_clip_2)
107
+ inter_neg_moment_feat: (N, L_clip_3, D_v)
108
+ inter_neg_moment_mask: (N, L_clip_2)
109
+ """
110
+ query_embed = self.query_encoder(query_feat, query_mask) # (N, D_o)
111
+ pos_dist = self.compute_pdist(query_embed, pos_moment_feat, pos_moment_mask) # (N, )
112
+ intra_neg_dist = self.compute_pdist(query_embed, intra_neg_moment_feat, intra_neg_moment_mask) # (N, )
113
+ if self.config.inter_loss_weight == 0: # should be zero for tef_only method.
114
+ loss_inter = 0.
115
+ else:
116
+ inter_neg_dist = self.compute_pdist(query_embed, inter_neg_moment_feat, inter_neg_moment_mask) # (N, )
117
+ loss_inter = self.calc_loss(pos_dist, inter_neg_dist)
118
+
119
+ loss = self.calc_loss(pos_dist, intra_neg_dist) + self.config.inter_loss_weight * loss_inter
120
+ return loss
121
+
122
+ def calc_loss(self, pos_dist, neg_dist):
123
+ """ Note here we encourage positive distance to be smaller than negative distance.
124
+ Args:
125
+ pos_dist: (N, ), torch.float32
126
+ neg_dist: (N, ), torch.float32
127
+ """
128
+ if self.config.loss_type == "hinge": # max(0, m + S_pos - S_neg)
129
+ return torch.clamp(self.config.margin + pos_dist - neg_dist, min=0).sum() / len(pos_dist)
130
+ elif self.config.loss_type == "lse": # log[1 + exp(S_pos - S_neg)]
131
+ return torch.log1p(torch.exp(pos_dist - neg_dist)).sum() / len(pos_dist)
132
+ else:
133
+ raise NotImplementedError("Only support 'hinge' and 'lse'")
134
+
135
+
136
+ class CALWithSub(nn.Module):
137
+ def __init__(self, config):
138
+ super(CALWithSub, self).__init__()
139
+ self.config = config
140
+ self.use_video = "video" in config.ctx_mode
141
+ self.use_sub = "sub" in config.ctx_mode
142
+ self.use_tef = "tef" in config.ctx_mode
143
+ self.tef_only = self.use_tef and not self.use_video and not self.use_sub
144
+
145
+ if self.use_video or self.tef_only:
146
+ self.video_moment_mlp = nn.Sequential(
147
+ nn.Linear(config.visual_input_size, config.visual_hidden_size),
148
+ nn.ReLU(True),
149
+ nn.Linear(config.visual_hidden_size, config.output_size),
150
+ )
151
+
152
+ if self.use_sub:
153
+ self.sub_moment_mlp = nn.Sequential(
154
+ nn.Linear(config.textual_input_size, config.visual_hidden_size),
155
+ nn.ReLU(True),
156
+ nn.Linear(config.visual_hidden_size, config.output_size),
157
+ )
158
+
159
+ self.query_lstm = RNNEncoder(word_embedding_size=config.query_feat_size,
160
+ hidden_size=config.lstm_hidden_size,
161
+ bidirectional=False,
162
+ rnn_type="lstm",
163
+ dropout_p=0,
164
+ n_layers=1,
165
+ return_outputs=False)
166
+
167
+ self.query_linear = nn.Linear(config.lstm_hidden_size, config.output_size)
168
+
169
+ def moment_encoder(self, moment_feat, module_name="video"):
170
+ """moment_feat: (N, L_clip, D_v)"""
171
+ if moment_feat is not None:
172
+ encoder = getattr(self, module_name + "_moment_mlp")
173
+ return F.normalize(encoder(moment_feat), p=2, dim=-1) # (N, L_clip, D_o)
174
+ else:
175
+ return None
176
+
177
+ def query_encoder(self, query_feat, query_mask):
178
+ """
179
+ Args:
180
+ query_feat: (N, L_q, D_q), torch.float32
181
+ query_mask: (N, L_q), torch.float32, with 1 indicates valid query, 0 indicates mask
182
+ """
183
+ _, hidden = self.query_lstm(query_feat, torch.sum(query_mask, dim=1).long())
184
+ return F.normalize(self.query_linear(hidden), p=2, dim=-1) # (N, D_o)
185
+
186
+ def _compute_pdist(self, query_embedding, moment_feat, moment_mask, module_name="video"):
187
+ """ pairwise L2 distance
188
+ Args:
189
+ query_embedding: (N, D_o)
190
+ moment_feat: (N, L_clip, D_v)
191
+ moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding
192
+ """
193
+ moment_embedding = self.moment_encoder(moment_feat, module_name=module_name) # (N, L_clip, D_o)
194
+ moment_clip_dist = torch.sum((moment_embedding - query_embedding.unsqueeze(1)) ** 2, dim=2) # (N, L_clip)
195
+ moment_dist = torch.sum(moment_clip_dist * moment_mask, dim=1) / moment_mask.sum(1) # (N, )
196
+ return moment_dist # (N, )
197
+
198
+ def compute_pdist(self, query_embedding, moment_video_feat, moment_sub_feat, moment_mask):
199
+ """ pairwise L2 distance
200
+ Args:
201
+ query_embedding: (N, D_o)
202
+ moment_video_feat: (N, L_clip, D_v)
203
+ moment_sub_feat: (N, L_clip, D_t)
204
+ moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding
205
+ """
206
+ divisor = (self.use_video or self.tef_only) + self.use_sub
207
+ video_moment_dist = self._compute_pdist(query_embedding, moment_video_feat, moment_mask, module_name="video") \
208
+ if self.use_video or self.tef_only else 0
209
+ sub_moment_dist = self._compute_pdist(query_embedding, moment_sub_feat, moment_mask, module_name="sub") \
210
+ if self.use_sub else 0
211
+ return (video_moment_dist + sub_moment_dist) / divisor # (N, )
212
+
213
+ def _compute_cdist_inference(self, query_embeddings, moment_embeddings, moment_mask):
214
+ """ Compute L2 distance for every possible pair of queries and proposals. This is different from
215
+ compute_pdist as the latter computes only pairs at each row.
216
+ Args:
217
+ query_embeddings: (N_q, D_o)
218
+ moment_embeddings: (N_prop, N_clips, D_o)
219
+ moment_mask: (N_prop, N_clips)
220
+ return:
221
+ query_moment_scores: (N_q, N_prop)
222
+ """
223
+ # sync device
224
+ query_device = query_embeddings.device # convert to cuda if we want to use GPU
225
+ if moment_embeddings.device != query_device:
226
+ moment_embeddings = moment_embeddings.to(query_device)
227
+ moment_mask = moment_mask.to(query_device)
228
+
229
+ # compute
230
+ n_query = query_embeddings.shape[0]
231
+ n_prop, n_clips, d = moment_embeddings.shape
232
+ query_clip_dist = torch.cdist(
233
+ query_embeddings, moment_embeddings.reshape(-1, d), p=2) ** 2 # (N_q, N_prop * N_clips)
234
+ query_clip_dist = query_clip_dist.reshape(n_query, n_prop, n_clips)
235
+ query_moment_dist = torch.sum(
236
+ query_clip_dist * moment_mask.unsqueeze(0), dim=2) / moment_mask.sum(1).unsqueeze(0)
237
+ return query_moment_dist # (N_q, N_prop)
238
+
239
+ def compute_cdist_inference(self, query_embeddings, video_moment_embeddings, sub_moment_embeddings, moment_mask):
240
+ divisor = (self.use_video or self.tef_only) + self.use_sub
241
+ video_moment_dist = self._compute_cdist_inference(query_embeddings, video_moment_embeddings, moment_mask) \
242
+ if self.use_video or self.tef_only else 0
243
+ sub_moment_dist = self._compute_cdist_inference(query_embeddings, sub_moment_embeddings, moment_mask) \
244
+ if self.use_sub else 0
245
+ return (video_moment_dist + sub_moment_dist) / divisor # (N_q, N_prop)
246
+
247
+ def forward(self, query_feat, query_mask, pos_moment_video_feat, pos_moment_video_mask,
248
+ intra_neg_moment_video_feat, intra_neg_moment_video_mask,
249
+ inter_neg_moment_video_feat, inter_neg_moment_video_mask,
250
+ pos_moment_sub_feat, pos_moment_sub_mask,
251
+ intra_neg_moment_sub_feat, intra_neg_moment_sub_mask,
252
+ inter_neg_moment_sub_feat, inter_neg_moment_sub_mask):
253
+ """
254
+ Args:
255
+ query_feat: (N, L, D_q)
256
+ query_mask: (N, L)
257
+ pos_moment_video_feat: (N, L_clip_1, D_v)
258
+ pos_moment_video_mask: (N, L_clip_1)
259
+ intra_neg_moment_video_feat: (N, L_clip_2, D_v)
260
+ intra_neg_moment_video_mask: (N, L_clip_2)
261
+ inter_neg_moment_video_feat: (N, L_clip_3, D_v)
262
+ inter_neg_moment_video_mask: (N, L_clip_2)
263
+ pos_moment_sub_feat:
264
+ pos_moment_sub_mask:
265
+ intra_neg_moment_sub_feat:
266
+ intra_neg_moment_sub_mask:
267
+ inter_neg_moment_sub_feat:
268
+ inter_neg_moment_sub_mask:
269
+ """
270
+ query_embed = self.query_encoder(query_feat, query_mask) # (N, D_o)
271
+ pos_dist = self.compute_pdist(
272
+ query_embed, pos_moment_video_feat, pos_moment_sub_feat,
273
+ moment_mask=pos_moment_sub_mask if self.use_sub else pos_moment_video_mask) # (N, )
274
+ intra_neg_dist = self.compute_pdist(
275
+ query_embed, intra_neg_moment_video_feat, intra_neg_moment_sub_feat,
276
+ moment_mask=intra_neg_moment_sub_mask if self.use_sub else intra_neg_moment_video_mask) # (N, )
277
+ if self.config.inter_loss_weight == 0: # should be zero for tef_only method.
278
+ loss_inter = 0.
279
+ else:
280
+ inter_neg_dist = self.compute_pdist(
281
+ query_embed, inter_neg_moment_video_feat, inter_neg_moment_sub_feat,
282
+ moment_mask=inter_neg_moment_sub_mask if self.use_sub else inter_neg_moment_video_mask) # (N, )
283
+ loss_inter = self.calc_loss(pos_dist, inter_neg_dist)
284
+
285
+ loss = self.calc_loss(pos_dist, intra_neg_dist) + self.config.inter_loss_weight * loss_inter
286
+ return loss
287
+
288
+ def calc_loss(self, pos_dist, neg_dist):
289
+ """ Note here we encourage positive distance to be smaller than negative distance.
290
+ Args:
291
+ pos_dist: (N, ), torch.float32
292
+ neg_dist: (N, ), torch.float32
293
+ """
294
+ if self.config.loss_type == "hinge": # max(0, m + S_pos - S_neg)
295
+ return torch.clamp(self.config.margin + pos_dist - neg_dist, min=0).sum() / len(pos_dist)
296
+ elif self.config.loss_type == "lse": # log[1 + exp(S_pos - S_neg)]
297
+ return torch.log1p(torch.exp(pos_dist - neg_dist)).sum() / len(pos_dist)
298
+ else:
299
+ raise NotImplementedError("Only support 'hinge' and 'lse'")
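A minimal smoke-test sketch for the CAL model above, using cal_base_cfg and random tensors with the documented shapes (assumes utils.model_utils.RNNEncoder is importable and behaves as used in this file):

import torch
from baselines.clip_alignment_with_language.model import CAL, cal_base_cfg

model = CAL(cal_base_cfg)
N, L_q, L_clip = 4, 10, 6
query_feat = torch.randn(N, L_q, cal_base_cfg.embedding_size)
query_mask = torch.ones(N, L_q)

def rand_moment():  # random (feat, mask) pair for one batch of moments
    return torch.randn(N, L_clip, cal_base_cfg.visual_input_size), torch.ones(N, L_clip)

pos_feat, pos_mask = rand_moment()
intra_feat, intra_mask = rand_moment()
inter_feat, inter_mask = rand_moment()
loss = model(query_feat, query_mask, pos_feat, pos_mask,
             intra_feat, intra_mask, inter_feat, inter_mask)
print(loss.item())  # scalar ranking loss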
baselines/clip_alignment_with_language/proposal_retrieval_dataset.py ADDED
@@ -0,0 +1,587 @@
1
+ """
2
+ Dataset for clip model
3
+ """
4
+ import logging
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+ import numpy as np
8
+ import h5py
9
+ import math
10
+ import random
11
+ from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array
12
+ from utils.tensor_utils import pad_sequences_1d
13
+ from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface
14
+ from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \
15
+ get_didemo_agreed_ts
16
+ from standalone_eval.eval import compute_temporal_iou_batch
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class ProposalRetrievalDataset(Dataset):
22
+ """
23
+ Args:
24
+ dset_name, str, ["tvr"]
25
+ ctx_mode: str,
26
+ pos_iou_thd: float, in [0, 1], >= pos_iou_thd are defined as positive
27
+ neg_iou_thd: float, in [0, 1], < neg_iou_thd are defined as negative
28
+ Return:
29
+ a dict: {
30
+ "meta": {
31
+ "desc_id": int,
32
+ "desc": str,
33
+ "vid_name": str,
34
+ "duration": float,
35
+ "ts": [st (float), ed (float)], seconds, ground_truth timestamps
36
+ "pos_moment": [st (float), ed (float)], seconds, IoU with "ts" >= pos_iou_thd
37
+ "intra_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd
38
+ "inter_neg_vid_name": str,
39
+ "inter_neg_duration": float,
40
+ "inter_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd
41
+ }
42
+ "model_inputs": {
43
+ "desc_feat": torch.tensor, (L, D_t)
44
+ "pos_moment_feat": torch.tensor, (n_clip_in_moment, D)
45
+ "intra_neg_moment_feat": torch.tensor, (n_clip_in_moment, D)
46
+ "inter_neg_moment_feat": torch.tensor, (n_clip_in_moment, D)
47
+ }
48
+ }
49
+ """
50
+ def __init__(self, dset_name, data_path, desc_bert_path, sub_bert_path, max_desc_len,
51
+ vid_feat_path, clip_length, vid_feat_size, sub_feat_size=0, ctx_mode="video_tef",
52
+ pos_iou_thd=0.7, neg_iou_thd=0.3, h5driver=None, data_ratio=1.0,
53
+ normalize_vfeat=True, normalize_tfeat=True, model_type="cal",
54
+ external_train_vr_res_path=None, corpus_path=None):
55
+ self.dset_name = dset_name
56
+ self.model_type = model_type
57
+ self.pool_local = model_type == "mcn" # pool local feature
58
+ self.data_path = data_path
59
+ self.data_ratio = data_ratio
60
+
61
+ self.desc_bert_path = desc_bert_path
62
+ self.max_desc_len = max_desc_len
63
+ self.sub_bert_path = sub_bert_path
64
+
65
+ self.vid_feat_path = vid_feat_path
66
+ self.clip_length = clip_length
67
+ self.ctx_mode = ctx_mode
68
+
69
+ self.pos_iou_thd = pos_iou_thd
70
+ self.neg_iou_thd = neg_iou_thd
71
+
72
+ self.vid_feat_output_size = 2 * vid_feat_size * ("video" in ctx_mode) + 2 * ("tef" in ctx_mode)
73
+ self.sub_feat_output_size = 2 * sub_feat_size * ("sub" in ctx_mode) + 2 * ("tef" in ctx_mode)
74
+
75
+ # prepare desc data
76
+ self.data = load_jsonl(data_path)
77
+ if self.data_ratio != 1:
78
+ n_examples = int(len(self.data) * data_ratio)
79
+ self.data = self.data[:n_examples]
80
+ logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples))
81
+
82
+ self.proposal_fn = get_proposal_interface(dset_name)
83
+ if self.ctx_mode != "tef":
84
+ self.vid_feat_h5 = h5py.File(self.vid_feat_path, "r", driver=h5driver)
85
+ self.desc_bert_h5 = h5py.File(self.desc_bert_path, "r", driver=h5driver)
86
+ if "sub" in self.ctx_mode:
87
+ self.sub_bert_h5 = h5py.File(self.sub_bert_path, "r", driver=h5driver)
88
+ self.normalize_vfeat = normalize_vfeat
89
+ self.normalize_tfeat = normalize_tfeat
90
+ self.use_video = "video" in self.ctx_mode
91
+ self.use_sub = "sub" in self.ctx_mode
92
+ self.use_tef = "tef" in self.ctx_mode
93
+
94
+ if external_train_vr_res_path is not None:
95
+ video_data = load_json(corpus_path)["train"]
96
+ # {video_idx: [vid_name, vid_duration]}
97
+ video_idx2name_dur_pair = {v[1]: [k, v[0]] for k, v in video_data.items()}
98
+ external_vr_res = load_json(external_train_vr_res_path)
99
+ # {desc_id: [(vid_name, vid_duration), ...]}
100
+ self.desc_id2video_names_dur_pairs = \
101
+ {e["desc_id"]: [video_idx2name_dur_pair[int(sub_e[0])] for sub_e in e["predictions"]]
102
+ for e in external_vr_res["VR"]} # ordered
103
+
104
+ def __len__(self):
105
+ return len(self.data)
106
+
107
+ def __getitem__(self, index):
108
+ raw_data = self.data[index]
109
+
110
+ # initialize with basic data
111
+ meta = dict(
112
+ desc_id=raw_data["desc_id"],
113
+ desc=raw_data["desc"],
114
+ vid_name=raw_data["vid_name"],
115
+ duration=raw_data["duration"],
116
+ ts=raw_data["ts"] if self.dset_name != "didemo" else get_didemo_agreed_ts(raw_data["ts"]),
117
+ )
118
+ model_inputs = dict()
119
+ query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len]
120
+ if self.normalize_tfeat:
121
+ query_feat = l2_normalize_np_array(query_feat)
122
+ model_inputs["query_feat"] = torch.from_numpy(query_feat)
123
+
124
+ # sample positive and negative moments
125
+ meta["pos_moment"] = self.align_ts_to_clip_boundaries(meta["duration"], meta["ts"])
126
+ meta["intra_neg_moment"] = self.sample_intra_neg_moment(meta["duration"], meta["ts"])
127
+ meta["inter_neg_moment"], meta["inter_neg_vid_name"], meta["inter_neg_duration"] = \
128
+ self.sample_inter_video_negative(meta["vid_name"], meta["pos_moment"] / meta["duration"],
129
+ desc_id=meta["desc_id"])
130
+
131
+ pos_tef, intra_neg_tef, inter_neg_tef = (None,) * 3
132
+ if self.use_tef:
133
+ pos_tef = meta["pos_moment"] / meta["duration"] # temporal endpoint feature, (2, )
134
+ intra_neg_tef = meta["intra_neg_moment"] / meta["duration"]
135
+ inter_neg_tef = meta["inter_neg_moment"] / meta["inter_neg_duration"]
136
+
137
+ if self.use_video:
138
+ pos_v_feat = self.vid_feat_h5[meta["vid_name"]] # (N_frm, D)
139
+ neg_v_feat = self.vid_feat_h5[meta["inter_neg_vid_name"]]
140
+ pos_v_ctx_feat = np.mean(pos_v_feat, axis=0)
141
+ neg_v_ctx_feat = np.mean(neg_v_feat, axis=0)
142
+ if self.normalize_vfeat:
143
+ pos_v_ctx_feat = l2_normalize_np_array(pos_v_ctx_feat)
144
+ neg_v_ctx_feat = l2_normalize_np_array(neg_v_ctx_feat)
145
+ pos_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["pos_moment"],
146
+ normalize=self.normalize_vfeat,
147
+ fix_outbound=True, pool_local=self.pool_local)
148
+ intra_neg_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["intra_neg_moment"],
149
+ normalize=self.normalize_vfeat,
150
+ fix_outbound=True, pool_local=self.pool_local)
151
+ inter_neg_moment_v_feat = self.get_moment_feat(neg_v_feat, meta["inter_neg_moment"],
152
+ normalize=self.normalize_vfeat,
153
+ fix_outbound=True, pool_local=self.pool_local)
154
+
155
+ # concat features, [video_clip_feat; video_context_feat; temporal_endpoint_feat]
156
+ model_inputs["pos_moment_video_feat"] = self.concat_feat_adv(
157
+ moment_feats=[pos_moment_v_feat, pos_v_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode)
158
+ model_inputs["intra_neg_moment_video_feat"] = self.concat_feat_adv(
159
+ moment_feats=[intra_neg_moment_v_feat, pos_v_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode)
160
+ model_inputs["inter_neg_moment_video_feat"] = self.concat_feat_adv(
161
+ moment_feats=[inter_neg_moment_v_feat, neg_v_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode)
162
+ else:
163
+ for k in ["pos_moment_video_feat", "intra_neg_moment_video_feat", "inter_neg_moment_video_feat"]:
164
+ model_inputs[k] = torch.zeros((2, 2))
165
+
166
+ if self.use_sub: # no need for ctx feature, as the features are already contextualized
167
+ pos_s_feat = self.sub_bert_h5[meta["vid_name"]] # (N_words, D_t)
168
+ neg_s_feat = self.sub_bert_h5[meta["inter_neg_vid_name"]]
169
+ pos_s_ctx_feat = np.mean(pos_s_feat, axis=0)
170
+ neg_s_ctx_feat = np.mean(neg_s_feat, axis=0)
171
+ if self.normalize_tfeat:
172
+ pos_s_ctx_feat = l2_normalize_np_array(pos_s_ctx_feat)
173
+ neg_s_ctx_feat = l2_normalize_np_array(neg_s_ctx_feat)
174
+ pos_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["pos_moment"],
175
+ normalize=self.normalize_tfeat,
176
+ fix_outbound=True, pool_local=self.pool_local)
177
+ intra_neg_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["intra_neg_moment"],
178
+ normalize=self.normalize_tfeat,
179
+ fix_outbound=True, pool_local=self.pool_local)
180
+ inter_neg_moment_s_feat = self.get_moment_feat(neg_s_feat, meta["inter_neg_moment"],
181
+ normalize=self.normalize_tfeat,
182
+ fix_outbound=True, pool_local=self.pool_local)
183
+
184
+ # concat features, [sub_clip_feat; sub_context_feat; temporal_endpoint_feat]
185
+ model_inputs["pos_moment_sub_feat"] = self.concat_feat_adv(
186
+ moment_feats=[pos_moment_s_feat, pos_s_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode)
187
+ model_inputs["intra_neg_moment_sub_feat"] = self.concat_feat_adv(
188
+ moment_feats=[intra_neg_moment_s_feat, pos_s_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode)
189
+ model_inputs["inter_neg_moment_sub_feat"] = self.concat_feat_adv(
190
+ moment_feats=[inter_neg_moment_s_feat, neg_s_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode)
191
+ else:
192
+ for k in ["pos_moment_sub_feat", "intra_neg_moment_sub_feat", "inter_neg_moment_sub_feat"]:
193
+ model_inputs[k] = torch.zeros((2, 2))
194
+
195
+ if not self.use_sub and not self.use_video and self.use_tef: # use video stream
196
+ model_inputs["pos_moment_video_feat"] = \
197
+ self.concat_feat_adv(tef=pos_tef, ctx_mode=self.ctx_mode)
198
+ model_inputs["intra_neg_moment_video_feat"] = \
199
+ self.concat_feat_adv(tef=intra_neg_tef, ctx_mode=self.ctx_mode)
200
+ model_inputs["inter_neg_moment_video_feat"] = \
201
+ self.concat_feat_adv(tef=inter_neg_tef, ctx_mode=self.ctx_mode)
202
+ return dict(meta=meta, model_inputs=model_inputs)
203
+
204
+ def align_ts_to_clip_boundaries(self, duration, ts):
205
+ """ # TODO Do we really need this???
206
+ Generate a moment [st, ed] that is aligned to clip boundaries:
207
+ st and ed must be a multiple of self.clip_length, and ed <= duration
208
+ duration: float,
209
+ ts: [st (float), ed (float)], ground_truth ts
210
+ """
211
+ clip_aligned_ts = np.array([math.floor(ts[0] / self.clip_length),
212
+ math.ceil(ts[1] / self.clip_length)]) * self.clip_length
213
+ clip_aligned_ts[1] = min(clip_aligned_ts[1], duration)
214
+ return clip_aligned_ts
215
+
216
+ def sample_intra_neg_moment(self, duration, ts):
217
+ """ Generate an intra-negative moment given the video duration and the GT ts.
218
+ The returned moment will be aligned to clip boundaries.
219
+ 1) neg_moment has at least 2 clips
220
+ 2) its iou with ts should be < self.neg_iou_thd
221
+ Args:
222
+ duration: float
223
+ ts: [st (float), ed (float)], ground_truth ts
224
+
225
+ Returns:
226
+
227
+ """
228
+ max_n_search = 5 # search at most max_n_search times so the program does not get stuck in an infinite loop.
229
+ sampled_moments = self.sample_ts_at_clip_boundaries(duration, n_pairs=max_n_search) # (n_pairs, 2)
230
+ sampled_moments_ious = compute_temporal_iou_batch(sampled_moments, ts) # (n_pairs, )
231
+ smallest_iou_idx = np.argmin(sampled_moments_ious)
232
+ sampled_moment = sampled_moments[smallest_iou_idx]
233
+ # only a small number (<20 with max_n_search==10) of samples are wrong,
234
+ # usually when the video_duration is too short.
235
+ # if sampled_moments_ious[smallest_iou_idx] >= self.neg_iou_thd:
236
+ # logger.warning("the sampled intra-neg might be wrong. "
237
+ # "v_dur {}, ts {}, sampled neg moment {}, iou {}"
238
+ # .format(duration, ts, sampled_moment, sampled_moments_ious[smallest_iou_idx]))
239
+ return sampled_moment
240
+
241
+ def sample_ts_at_clip_boundaries(self, duration, n_pairs=1):
242
+ """sample n_pairs moment at clip boundaries, each has at least two clips."""
243
+ # '+ self.clip_length' since we assume indexing using [clip_st_idx, clip_ed_idx),
244
+ moments = np.random.randint(0, np.ceil(duration / self.clip_length), size=(n_pairs, 2))
245
+ moments = np.sort(moments, axis=1) * self.clip_length
246
+ less_equal = moments[:, 1] - moments[:, 0] <= self.clip_length
247
+ start_zero = moments[:, 0] == 0
248
+ moments[:, 1][less_equal * start_zero] += self.clip_length
249
+ moments[:, 0][less_equal * ~start_zero] -= self.clip_length # masks must stay boolean
250
+ return moments
251
+
252
+ def sample_inter_video_negative(self, pos_vid_name, normalized_pos_moment, desc_id=None):
253
+ """Sample a negative moment --> negative video + similar normalized moment.
254
+ 1) they are not from the same video
255
+ Args:
256
+ pos_vid_name: str,
257
+ normalized_pos_moment: np.ndarray, (2, ), value in [0, 1], normalized by duration.
258
+ desc_id: str
259
+ Returns:
260
+ moment: np.ndarray, (2, ), ts aligned to clip boundaries.
261
+
262
+ """
263
+ use_guided_negative = hasattr(self, "desc_id2video_names_dur_pairs")
264
+ if use_guided_negative:
265
+ top_videos = self.desc_id2video_names_dur_pairs[desc_id]
266
+ max_idx = len(top_videos) - 1
267
+
268
+ while True: # usually only run once.
269
+ if use_guided_negative:
270
+ sampled_idx = min(max_idx, int(random.expovariate(0.1)))
271
+ sampled_video_name, sampled_video_dur = top_videos[sampled_idx]
272
+ else:
273
+ neg_vid_data = self.data[int(random.random() * len(self))]
274
+ sampled_video_name, sampled_video_dur = neg_vid_data["vid_name"], neg_vid_data["duration"]
275
+ if sampled_video_name != pos_vid_name:
276
+ inter_neg_moment = self.align_ts_to_clip_boundaries(
277
+ sampled_video_dur, sampled_video_dur * normalized_pos_moment)
278
+ break
279
+
280
+ return inter_neg_moment, sampled_video_name, sampled_video_dur
281
+
282
+ @classmethod
283
+ def get_clip_indices_from_moments(cls, moment, clip_length):
284
+ clip_st_ed_indices = moment / clip_length
285
+ return math.floor(clip_st_ed_indices[0]), math.ceil(clip_st_ed_indices[1])
286
+
287
+ def get_moment_feat(self, vid_feat, moment, normalize=True, fix_outbound=False, pool_local=False):
288
+ """Each moment contains multiple clips.
289
+ Inside means [moment[0], moment[1]] (seconds)
290
+ Args:
291
+ vid_feat: np.ndarray, (N_clips, D)
292
+ moment: [st (float), ed (float)], np.ndarray
293
+ normalize: L2 normalize features
294
+ fix_outbound: bool,
295
+ pool_local: whether to mean pool the features
296
+ Returns:
297
+ moment_feature: np.ndarray, ((moment[1] - moment[0]) / clip_length, D) or (D, )
298
+ """
299
+ clip_st_idx, clip_ed_idx = self.get_clip_indices_from_moments(moment, self.clip_length)
300
+ if fix_outbound:
301
+ vid_feat_len = len(vid_feat)
302
+ if clip_st_idx >= vid_feat_len:
303
+ clip_st_idx = vid_feat_len - 2
304
+ moment_feat = vid_feat[clip_st_idx:clip_ed_idx] # indexed as [st, ed)
305
+ if pool_local:
306
+ moment_feat = np.mean(moment_feat, axis=0, keepdims=True)
307
+ if normalize:
308
+ moment_feat = l2_normalize_np_array(moment_feat)
309
+ return moment_feat # (n_clip_in_moment, D) or (D, )
310
+
311
+ @classmethod
312
+ def concat_feat_adv(cls, moment_feats=None, tef=None, to_torch=True, ctx_mode="tef"):
313
+ """ Concat moment_feat with other_feats and tef. All the features should be L2 normalized before concatenating
314
+ Args:
315
+ moment_feats: list of feats, one of them might be None. Other possible values are
316
+ ctx_feat (D, ) or sub(vid)_moment_feat (N_p, N_clips, D_t) or (N_clips, D_t).
317
+ The first non-None feature array is used as base for the rest to concatenate with.
318
+ tef: (N_p, 2) or (2, ), np.ndarray
319
+ to_torch: convert resulting np.ndarray to torch.tensor
320
+ ctx_mode:
321
+ """
322
+ if ctx_mode == "tef":
323
+ assembled_feat = np.expand_dims(tef, axis=-2)
324
+ else: # concat moment_feat with all other_feats
325
+ moment_feats = [e for e in moment_feats if e is not None] # remove possible None (placeholder)
326
+ extra_dims = moment_feats[0].shape[:-1] # all others will need to broadcast to match it.
327
+ if isinstance(extra_dims, int): # happens when len(moment_feat.shape) == 2
328
+ extra_dims = (extra_dims, )
329
+ last_dim_lengths = [0, ] + [e.shape[-1] for e in moment_feats]
330
+ if "tef" in ctx_mode: # add tef
331
+ last_dim_lengths += [2, ]
332
+ moment_feats += [np.expand_dims(tef, axis=-2), ]
333
+
334
+ if len(moment_feats) > 1:
335
+ assembled_feat = np.empty(extra_dims + (sum(last_dim_lengths), ), dtype=np.float32)
336
+ last_dim_lengths_cumsum = [sum(last_dim_lengths[0:idx+1]) for idx in range(len(last_dim_lengths))]
337
+ for idx, feat in enumerate(moment_feats):
338
+ assembled_feat[..., last_dim_lengths_cumsum[idx]:last_dim_lengths_cumsum[idx+1]] = feat
339
+ else:
340
+ assembled_feat = moment_feats[0]
341
+
342
+ if to_torch:
343
+ return torch.from_numpy(assembled_feat)
344
+ else:
345
+ return assembled_feat # (N_prop, N_clips, D_concat) or (N_clips, D_concat)
346
+
347
+
348
+ class ProposalRetrievalEvalDataset(Dataset):
349
+ """
350
+ init_data_mode: `video_query` or `video_only` or `query_only`,
351
+ it indicates which data to load when initialize the Dataset object.
352
+ data_mode: `context` or `query`, it indicates which data to return for self.__get_item__()
353
+ desc_bert_path_or_handler: h5py.File object or str path
354
+ vid_feat_path_or_handler: h5py.File object or str path
355
+ eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with
356
+ max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals.
357
+ load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval.
358
+ data_ratio: percentage of query data to use.
359
+ """
360
+ def __init__(self, dset_name, eval_split_name, data_path=None,
361
+ desc_bert_path_or_handler=None, max_desc_len=None,
362
+ sub_bert_path_or_handler=None, vid_feat_path_or_handler=None,
363
+ corpus_path=None, clip_length=None,
364
+ eval_proposal_bsz=None, ctx_mode="tef", data_mode="context",
365
+ h5driver=None, data_ratio=1.0, normalize_vfeat=True,
366
+ normalize_tfeat=True, max_n_proposals=90, model_type="cal"):
367
+ self.dset_name = dset_name
368
+ self.model_type = model_type
369
+ self.pool_local = model_type == "mcn" # pool local feature
370
+ self.eval_split_name = eval_split_name
371
+ self.ctx_mode = ctx_mode
372
+ self.load_gt_video = False
373
+ self.data_ratio = data_ratio # only affect query data
374
+ self.normalize_vfeat = normalize_vfeat
375
+ self.normalize_tfeat = normalize_tfeat
376
+ self.max_n_proposals = max_n_proposals
377
+
378
+ self.data_mode = None
379
+ self.set_data_mode(data_mode)
380
+
381
+ self.max_desc_len = max_desc_len
382
+ self.data_path = data_path
383
+ self.query_data = load_jsonl(data_path)
384
+ if data_ratio != 1:
385
+ n_examples = int(len(self.query_data) * data_ratio)
386
+ self.query_data = self.query_data[:n_examples]
387
+ logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples))
388
+ if isinstance(desc_bert_path_or_handler, h5py.File):
389
+ self.desc_bert_h5 = desc_bert_path_or_handler
390
+ else:
391
+ self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver)
392
+
393
+ video_data = load_json(corpus_path)[self.eval_split_name]
394
+ self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()]
395
+ self.video2idx = {k: v[1] for k, v in video_data.items()}
396
+ self.eval_proposal_bsz = eval_proposal_bsz
397
+ self.clip_length = clip_length
398
+ self.proposal_fn = get_proposal_interface(dset_name)
399
+
400
+ self.use_video = "video" in self.ctx_mode
401
+ self.use_sub = "sub" in self.ctx_mode
402
+ self.use_tef = "tef" in self.ctx_mode
403
+
404
+ if self.use_video:
405
+ if isinstance(vid_feat_path_or_handler, h5py.File):
406
+ self.vid_feat_h5 = vid_feat_path_or_handler
407
+ else: # str path
408
+ self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver)
409
+
410
+ if self.use_sub:
411
+ if isinstance(sub_bert_path_or_handler, h5py.File):
412
+ self.sub_bert_h5 = sub_bert_path_or_handler
413
+ else: # str path
414
+ self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver)
415
+
416
+ def set_data_mode(self, data_mode):
417
+ """context or query"""
418
+ assert data_mode in ["context", "query"]
419
+ self.data_mode = data_mode
420
+
421
+ def load_gt_vid_name_for_query(self, load_gt_video):
422
+ """load_gt_video: bool, affect the returned value of self._get_item_query"""
423
+ assert "vid_name" in self.query_data[0]
424
+ self.load_gt_video = load_gt_video
425
+
426
+ def __len__(self):
427
+ if self.data_mode == "context":
428
+ return len(self.video_data)
429
+ else:
430
+ return len(self.query_data)
431
+
432
+ def __getitem__(self, index):
433
+ if self.data_mode == "context":
434
+ return self._get_item_context(index)
435
+ else:
436
+ return self._get_item_query(index)
437
+
438
+ def _get_item_query(self, index):
439
+ """Need to batch"""
440
+ raw_data = self.query_data[index]
441
+
442
+ meta = dict(
443
+ desc_id=raw_data["desc_id"],
444
+ desc=raw_data["desc"],
445
+ vid_name=raw_data["vid_name"] if self.load_gt_video else None
446
+ )
447
+
448
+ model_inputs = dict()
449
+ query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len]
450
+ if self.normalize_tfeat:
451
+ query_feat = l2_normalize_np_array(query_feat)
452
+ model_inputs["query_feat"] = torch.from_numpy(query_feat)
453
+ return dict(meta=meta, model_inputs=model_inputs)
454
+
455
+ def _get_item_context(self, index):
456
+ """No need to batch, since it has already been batched here"""
457
+ raw_data = self.video_data[index]
458
+
459
+ # get proposals and sort in ascending order, to get more efficient batching
460
+ proposals = self.proposal_fn(
461
+ video_id="", metadata={"duration": raw_data["duration"]}) # np.ndarray (N_p, 2)
462
+ proposals_lengths = proposals[:, 1] - proposals[:, 0] # seconds
463
+ sorted_proposal_indices = np.argsort(proposals_lengths)[:self.max_n_proposals]
464
+ sorted_proposals = proposals[sorted_proposal_indices]
465
+
466
+ # initialize with basic data
467
+ meta = dict(
468
+ vid_name=raw_data["vid_name"],
469
+ duration=raw_data["duration"],
470
+ proposals=sorted_proposals
471
+ )
472
+ model_inputs = dict()
473
+
474
+ n_proposal_batches = math.ceil(1.0 * len(sorted_proposals) / self.eval_proposal_bsz)
475
+
476
+ tef_batched_list = [None, ] * n_proposal_batches
477
+ t_moments_mask_list = [None, ] * n_proposal_batches
478
+ if self.use_tef:
479
+ tef_array = sorted_proposals / meta["duration"] # (N_p, 2)
480
+ for batch_idx in range(n_proposal_batches):
481
+ st_m_idx = batch_idx * self.eval_proposal_bsz
482
+ ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz
483
+ tef_batched_list[batch_idx] = tef_array[st_m_idx:ed_m_idx]
484
+ t_moments_mask_list[batch_idx] = \
485
+ np.ones((len(tef_batched_list[batch_idx]), 1), dtype=np.float32)
486
+ if not self.use_video and not self.use_sub: # use video stream
487
+ model_inputs["video_moment_features_list"] = [
488
+ ProposalRetrievalDataset.concat_feat_adv(tef=t, ctx_mode=self.ctx_mode) for t in tef_batched_list]
489
+ model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in t_moments_mask_list]
490
+
491
+ # extract/group/pad
492
+ if self.use_video:
493
+ v_feat = self.vid_feat_h5[meta["vid_name"]] # (N_frm, D)
494
+ v_ctx_feat = np.mean(v_feat, axis=0) # (D, )
495
+ if self.normalize_vfeat:
496
+ v_ctx_feat = l2_normalize_np_array(v_ctx_feat)
497
+ v_padded_moments_features_list, v_moments_mask_list = \
498
+ self.get_batched_moment_feat_for_all_proposals(v_feat, sorted_proposals,
499
+ pool_local=self.pool_local,
500
+ normalize=self.normalize_vfeat)
501
+
502
+ model_inputs["video_moment_features_list"] = [ProposalRetrievalDataset.concat_feat_adv(
503
+ moment_feats=[v, v_ctx_feat], tef=t, ctx_mode=self.ctx_mode)
504
+ for v, t in zip(v_padded_moments_features_list, tef_batched_list)]
505
+ model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in v_moments_mask_list]
506
+
507
+ if self.use_sub:
508
+ s_feat = self.sub_bert_h5[meta["vid_name"]] # (N_frm, D)
509
+ s_ctx_feat = np.mean(s_feat, axis=0) # (D, )
510
+ if self.normalize_tfeat:
511
+ s_ctx_feat = l2_normalize_np_array(s_ctx_feat)
512
+ s_padded_moments_features_list, s_moments_mask_list = \
513
+ self.get_batched_moment_feat_for_all_proposals(s_feat, sorted_proposals,
514
+ pool_local=self.pool_local,
515
+ normalize=self.normalize_tfeat)
516
+ model_inputs["sub_moment_features_list"] = [ProposalRetrievalDataset.concat_feat_adv(
517
+ moment_feats=[s, s_ctx_feat], tef=t, ctx_mode=self.ctx_mode)
518
+ for s, t in zip(s_padded_moments_features_list, tef_batched_list)]
519
+ model_inputs["sub_moment_mask_list"] = [torch.from_numpy(e) for e in s_moments_mask_list]
520
+ return dict(meta=meta, model_inputs=model_inputs)
521
+
522
+ def get_batched_moment_feat_for_all_proposals(self, feature, moments, pool_local=False, normalize=True):
523
+ """proposals of the same video will be segmented into multiple batches to accommodate GPU memory
524
+ pool_local: pool local feature into a single vector
525
+ """
526
+ n_proposal_batches = math.ceil(1.0 * len(moments) / self.eval_proposal_bsz)
527
+ padded_moments_features_list = [None, ] * n_proposal_batches
528
+ moments_mask_list = [None, ] * n_proposal_batches
529
+ moments_features = self.get_moment_feat_for_all_proposals(
530
+ feature, moments, normalize=normalize, pool_local=pool_local) # N_p * [(N_clips, D), ]
531
+ for batch_idx in range(n_proposal_batches):
532
+ st_m_idx = batch_idx * self.eval_proposal_bsz
533
+ ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz
534
+ padded_moments_features, moments_mask = \
535
+ pad_sequences_1d(moments_features[st_m_idx:ed_m_idx], dtype=np.float32)
536
+ padded_moments_features_list[batch_idx] = padded_moments_features
537
+ moments_mask_list[batch_idx] = moments_mask
538
+ assert np.sum(np.sum(moments_mask, axis=1) == 0) == 0, " err {}".format(moments_mask)
539
+ assert np.sum(np.sum(moments_mask_list[0], axis=1) == 0) == 0, " err {}".format(moments_mask_list)
540
+ return padded_moments_features_list, moments_mask_list
541
+
542
+ def get_moment_feat_for_all_proposals(self, vid_feat, moments, normalize=True, pool_local=False):
543
+ """Each moment is comprised of multiple clips
544
+ Args:
545
+ vid_feat: np.ndarray, (N_clips, D)
546
+ moments: np.ndarray, (N_p, 2), each row is [st (float), ed (float)],
547
+ normalize: L2 normalize
548
+ pool_local:
549
+ Returns:
550
+ moments_features: list(np.ndarray), [(N_clips, D), ] * N_p, N_clips is changing.
551
+ """
552
+ if normalize and not pool_local:
553
+ vid_feat = l2_normalize_np_array(vid_feat)
554
+ vid_feat_len = len(vid_feat)
555
+ moments_st_clip_indices = np.floor(moments[:, 0] / self.clip_length).astype(np.int64).clip(0, vid_feat_len-1)
556
+ moments_ed_clip_indices = np.ceil(moments[:, 1] / self.clip_length).astype(np.int64).clip(1, vid_feat_len)
557
+ moments_features = []
558
+ for st_idx, ed_idx, m in zip(moments_st_clip_indices, moments_ed_clip_indices, moments):
559
+ feat = vid_feat[st_idx:ed_idx]
560
+ if pool_local:
561
+ feat = np.mean(feat, axis=0, keepdims=True)
562
+ if normalize:
563
+ feat = l2_normalize_np_array(feat)
564
+ moments_features.append(feat)
565
+ return moments_features
566
+
567
+
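# Illustrative example of the clip indexing above (numbers are hypothetical): with
# clip_length = 1.5, a proposal [3.2, 7.0] (in seconds) maps to
# st_idx = floor(3.2 / 1.5) = 2 and ed_idx = ceil(7.0 / 1.5) = 5, so vid_feat[2:5]
# (three clips) is taken as the moment feature before optional local pooling and L2 normalization.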
568
+ def proposal_retrieval_collate(batch):
569
+ batch_meta = [e["meta"] for e in batch] # meta dicts are kept as a list, no collation needed
570
+
571
+ model_inputs_keys = batch[0]["model_inputs"].keys()
572
+ batched_data = {k: pad_sequences_1d([e["model_inputs"][k] for e in batch], dtype=torch.float32)
573
+ for k in model_inputs_keys}
574
+ return batch_meta, batched_data
575
+
576
+
577
+ def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False):
578
+ model_inputs = {}
579
+ for k, v in batched_model_inputs.items():
580
+ model_inputs[k] = v[0].to(device, non_blocking=non_blocking)
581
+ model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking)
582
+ return model_inputs
583
+
584
+
585
+ if __name__ == '__main__':
586
+ from baselines.clip_alignment_with_language.config import BaseOptions
587
+ options = BaseOptions().parse()
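The collate and batch-preparation helpers above are meant to be chained behind a PyTorch DataLoader; a minimal sketch of that wiring (the construction of `train_set` is omitted and the argument values are illustrative):

    from torch.utils.data import DataLoader

    loader = DataLoader(train_set, collate_fn=proposal_retrieval_collate,
                        batch_size=128, shuffle=True, pin_memory=True)
    for batch_meta, batched_inputs in loader:
        # every value in batched_inputs is a (padded_tensor, mask) pair from pad_sequences_1d;
        # prepare_batch_inputs moves both to the device and stores the mask under the
        # parallel key obtained via k.replace("feat", "mask")
        model_inputs = prepare_batch_inputs(batched_inputs, device="cuda", non_blocking=True)
        # loss = model(**model_inputs)  # as done in train.py below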
baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh ADDED
@@ -0,0 +1,23 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ dset_name=$1 # see case below
4
+ split_name=$2 # train/val/test, some datasets may not support all the 3 splits
5
+ result_dir="baselines/clip_alignment_with_language/results"
6
+
7
+ echo "Running with dataset ${dset_name} with split ${split_name}"
8
+ case ${dset_name} in
9
+ tvr) # only supports train/val
10
+ eval_file_path=data/tvr_${split_name}_release.jsonl
11
+ save_path=${result_dir}/tvr_${split_name}_proposal_upper_bound.json
12
+ ;;
13
+ *)
14
+ echo -n "Unknown argument"
15
+ ;;
16
+ esac
17
+
18
+ echo "Running evaluation"
19
+ python baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py \
20
+ -dset_name=${dset_name} \
21
+ -eval_file_path=${eval_file_path} \
22
+ -save_path=${save_path} \
23
+ -verbose
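The "upper bound" computed by this script is, presumably, the oracle performance of the fixed proposal scheme: for each annotated moment, whether any generated proposal reaches a given IoU with the ground truth. A rough sketch of that idea (function names and data layout are illustrative, not taken from compute_proposal_upper_bound.py):

    import numpy as np

    def temporal_iou(pred, gt):
        # pred, gt: [st, ed] in seconds
        inter = max(0.0, min(pred[1], gt[1]) - max(pred[0], gt[0]))
        union = max(pred[1], gt[1]) - min(pred[0], gt[0])
        return inter / union if union > 0 else 0.0

    def proposal_upper_bound(gt_moments, proposals_per_query, iou_thd=0.7):
        # fraction of ground-truth moments hit by at least one proposal at the IoU threshold
        hits = [any(temporal_iou(p, gt) >= iou_thd for p in props)
                for gt, props in zip(gt_moments, proposals_per_query)]
        return float(np.mean(hits))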
baselines/clip_alignment_with_language/scripts/inference.sh ADDED
@@ -0,0 +1,17 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/clip_alignment_with_language/scripts/inference.sh ANY_OTHER_PYTHON_ARGS
5
+ model_dir=$1
6
+ eval_split_name=$2
7
+ eval_path=data/tvr_${eval_split_name}_release.jsonl
8
+ tasks=(VR)
9
+ tasks+=(SVMR)
10
+ tasks+=(VCMR)
11
+ echo "tasks ${tasks[@]}"
12
+ python baselines/clip_alignment_with_language/inference.py \
13
+ --model_dir ${model_dir} \
14
+ --tasks ${tasks[@]} \
15
+ --eval_split_name ${eval_split_name} \
16
+ --eval_path ${eval_path} \
17
+ ${@:3}
baselines/clip_alignment_with_language/scripts/inference_mix.sh ADDED
@@ -0,0 +1,27 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/clip_alignment_with_language/scripts/inference_mix.sh
5
+ eval_model=$1 # [mcn, cal], retrain models should only be paired with mee
6
+ project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval/baselines/clip_alignment_with_language/results
7
+
8
+ # setup eval model
9
+ if [[ ${eval_model} == mcn ]]; then
10
+ pred_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40
11
+ tef_pred_dir=tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57
12
+ elif [[ ${eval_model} == cal ]]; then
13
+ pred_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59
14
+ tef_pred_dir=tvr-cal-video_sub_tef-res-2019_11_05_14_25_49
15
+ fi
16
+
17
+ pred_path=${project_root}/${pred_dir}/inference_tvr_test_public_max200_predictions_VR_SVMR_VCMR.json
18
+ save_path=${project_root}/${pred_dir}/inference_tvr_test_public_max200_predictions_VR_SVMR_VCMR_rerank_${tef_pred_dir}.json
19
+ tef_pred_path=${project_root}/${tef_pred_dir}/inference_tvr_test_public_max10000_predictions_VCMR.pt
20
+ gt_path=data/tvr_test_public_archive.jsonl
21
+
22
+
23
+ python baselines/clip_alignment_with_language/mix_model_prediction.py \
24
+ --pred_path=${pred_path} \
25
+ --tef_pred_path=${tef_pred_path} \
26
+ --gt_path=${gt_path} \
27
+ --save_path=${save_path}
baselines/clip_alignment_with_language/scripts/inference_with_external.sh ADDED
@@ -0,0 +1,54 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/clip_alignment_with_language/scripts/inference_with_external.sh
5
+ #model_dir=$1
6
+ # DO not use NMS, since it gives worse results
7
+ eval_model=$1 # [mcn, mcn_tef, cal, cal_tef, mcn_tef_retrain, cal_tef_retrain], retrain models should only be paired with mee
8
+ external_model=$2 # [mee, mcn, cal]
9
+ eval_split_name=$3
10
+ eval_path=data/tvr_${eval_split_name}_release.jsonl
11
+ project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval/baselines
12
+
13
+ # setup eval model
14
+ if [[ ${eval_model} == mcn ]]; then
15
+ eval_model_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40
16
+ elif [[ ${eval_model} == mcn_tef ]]; then
17
+ eval_model_dir=tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57
18
+ elif [[ ${eval_model} == cal ]]; then
19
+ eval_model_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59
20
+ elif [[ ${eval_model} == cal_tef ]]; then
21
+ eval_model_dir=tvr-cal-video_sub_tef-res-2019_11_05_14_25_49
22
+ elif [[ ${eval_model} == mcn_tef_retrain ]]; then
23
+ eval_model_dir=tvr-mcn-video_sub_tef-+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57+-2019_11_06_02_26_49
24
+ elif [[ ${eval_model} == cal_tef_retrain ]]; then
25
+ eval_model_dir=tvr-cal-video_sub_tef-+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-cal-video_sub_tef-res-2019_11_05_14_25_49+-2019_11_06_03_12_15
26
+ fi
27
+
28
+ # setup external
29
+ if [[ ${external_model} == mee ]]; then
30
+ external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39
31
+ external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json
32
+ elif [[ ${external_model} == mcn ]]; then
33
+ external_model_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40
34
+ external_inference_vr_res_path=${project_root}/clip_alignment_with_language/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR_SVMR_VCMR.json
35
+ elif [[ ${external_model} == cal ]]; then
36
+ external_model_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59
37
+ external_inference_vr_res_path=${project_root}/clip_alignment_with_language/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR_SVMR_VCMR.json
38
+ fi
39
+
40
+ tasks=(VR)
41
+ tasks+=(SVMR)
42
+ tasks+=(VCMR)
43
+ echo "tasks ${tasks[@]}"
44
+ python baselines/clip_alignment_with_language/inference.py \
45
+ --model_dir ${eval_model_dir} \
46
+ --tasks ${tasks[@]} \
47
+ --eval_split_name ${eval_split_name} \
48
+ --eval_path ${eval_path} \
49
+ --external_inference_vr_res_path ${external_inference_vr_res_path} \
50
+ --eval_id ${external_model_dir} \
51
+ ${@:4}
52
+
53
+ #--use_intermediate \ # temporary removed
54
+
baselines/clip_alignment_with_language/scripts/re_train_cal.sh ADDED
@@ -0,0 +1,21 @@
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ lr=0.00005
4
+ n_epoch=20
5
+ project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval
6
+ ckpt_filename="model.ckpt"
7
+ init_ckpt_path=${project_root}/baselines/clip_alignment_with_language/results/tvr-cal-video_sub_tef-res-2019_11_05_14_25_49/${ckpt_filename}
8
+ exp_id=+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-cal-video_sub_tef-res-2019_11_05_14_25_49+
9
+ external_train_vr_res_path=${project_root}/baselines/mixture_embedding_experts/results/tvr-video_sub-res-2019_11_06_00_33_39/inference_tvr_train_None_predictions_VR.json
10
+ model_type=cal
11
+
12
+ bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \
13
+ --no_norm_vfeat \
14
+ --model_type ${model_type} \
15
+ --exp_id ${exp_id} \
16
+ --init_ckpt_path ${init_ckpt_path} \
17
+ --external_train_vr_res_path ${external_train_vr_res_path} \
18
+ --lr ${lr} \
19
+ --n_epoch ${n_epoch} \
20
+ --max_es_cnt 5 \
21
+ ${@:1}
baselines/clip_alignment_with_language/scripts/re_train_mcn.sh ADDED
@@ -0,0 +1,21 @@
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ lr=0.00005
4
+ n_epoch=20
5
+ project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval
6
+ ckpt_filename="model.ckpt"
7
+ init_ckpt_path=${project_root}/baselines/clip_alignment_with_language/results/tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57/${ckpt_filename}
8
+ exp_id=+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57+
9
+ external_train_vr_res_path=${project_root}/baselines/mixture_embedding_experts/results/tvr-video_sub-res-2019_11_06_00_33_39/inference_tvr_train_None_predictions_VR.json
10
+ model_type=mcn
11
+
12
+ bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \
13
+ --no_norm_vfeat \
14
+ --model_type ${model_type} \
15
+ --exp_id ${exp_id} \
16
+ --init_ckpt_path ${init_ckpt_path} \
17
+ --external_train_vr_res_path ${external_train_vr_res_path} \
18
+ --lr ${lr} \
19
+ --n_epoch ${n_epoch} \
20
+ --max_es_cnt 5 \
21
+ ${@:1}
baselines/clip_alignment_with_language/scripts/train.sh ADDED
@@ -0,0 +1,80 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/clip_alignment_with_language/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS
5
+ # if re-training, also provide --init_ckpt_path and --external_train_vr_res_path; a lower lr may also help
6
+ dset_name=$1 # see case below
7
+ ctx_mode=$2 # ["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"]
8
+ vid_feat_type=$3 # [resnet, i3d, resnet_i3d, none] , none for subtitles only models
9
+ feature_root=data/tvr_feature_release
10
+ results_root=baselines/clip_alignment_with_language/results
11
+ vid_feat_size=2048
12
+ extra_args=()
13
+
14
+ if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then
15
+ if [[ ${dset_name} != "tvr" ]]; then
16
+ echo "The use of subtitles is only supported in tvr."
17
+ exit 1
18
+ fi
19
+ fi
20
+
21
+
22
+ case ${dset_name} in
23
+ tvr)
24
+ train_path=data/tvr_train_release.jsonl
25
+ corpus_path=data/tvr_video2dur_idx.json
26
+ desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5
27
+ vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5
28
+ clip_length=1.5
29
+ eval_split_name=val
30
+ nms_thd=-1
31
+ extra_args+=(--eval_path)
32
+ extra_args+=(data/tvr_val_release.jsonl)
33
+
34
+ if [[ ${vid_feat_type} == "i3d" ]]; then
35
+ echo "Using I3D feature with shape 1024"
36
+ vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5
37
+ vid_feat_size=1024
38
+ elif [[ ${vid_feat_type} == "resnet" ]]; then
39
+ echo "Using ResNet feature with shape 2048"
40
+ vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5
41
+ vid_feat_size=2048
42
+ elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then
43
+ echo "Using concatenated ResNet and I3D feature with shape 2048+1024"
44
+ vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5
45
+ vid_feat_size=3072
46
+ extra_args+=(--no_norm_vfeat) # since they are already normalized.
47
+ fi
48
+
49
+ if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then
50
+ echo "Running with sub."
51
+ desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite
52
+ sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5
53
+ sub_feat_size=768
54
+ extra_args+=(--sub_feat_size)
55
+ extra_args+=(${sub_feat_size})
56
+ extra_args+=(--sub_bert_path)
57
+ extra_args+=(${sub_bert_path})
58
+ fi
59
+ ;;
60
+ *)
61
+ echo -n "Unknown argument"
62
+ ;;
63
+ esac
64
+
65
+ echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]"
66
+ echo "Extra args ${extra_args[@]}"
67
+ python baselines/clip_alignment_with_language/train.py \
68
+ --dset_name=${dset_name} \
69
+ --eval_split_name=${eval_split_name} \
70
+ --nms_thd=${nms_thd} \
71
+ --results_root=${results_root} \
72
+ --train_path=${train_path} \
73
+ --desc_bert_path=${desc_bert_path} \
74
+ --corpus_path=${corpus_path} \
75
+ --vid_feat_path=${vid_feat_path} \
76
+ --clip_length=${clip_length} \
77
+ --vid_feat_size=${vid_feat_size} \
78
+ --ctx_mode=${ctx_mode} \
79
+ ${extra_args[@]} \
80
+ ${@:4}
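The resnet_i3d branch passes --no_norm_vfeat because the concatenated features are shipped already normalized, presumably L2-normalized per stream before concatenation. A small sketch of that preprocessing (the arrays `resnet_feat` and `i3d_feat` of shape (N_frm, D) are hypothetical):

    import numpy as np

    def l2_normalize(x, eps=1e-5):
        return x / (np.linalg.norm(x, axis=-1, keepdims=True) + eps)

    # normalize each stream separately, then concatenate along the feature dim -> (N_frm, 2048 + 1024)
    resnet_i3d_feat = np.concatenate([l2_normalize(resnet_feat), l2_normalize(i3d_feat)], axis=-1)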
baselines/clip_alignment_with_language/train.py ADDED
@@ -0,0 +1,310 @@
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ import pprint
5
+ import random
6
+ import numpy as np
7
+ from collections import OrderedDict
8
+ from easydict import EasyDict as EDict
9
+ from tqdm import tqdm, trange
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.backends.cudnn as cudnn
14
+ from torch.utils.data import DataLoader
15
+ from torch.utils.tensorboard import SummaryWriter
16
+
17
+ from baselines.clip_alignment_with_language.config import BaseOptions
18
+ from baselines.clip_alignment_with_language.model import CALWithSub
19
+ from baselines.clip_alignment_with_language.proposal_retrieval_dataset import \
20
+ ProposalRetrievalDataset, proposal_retrieval_collate, ProposalRetrievalEvalDataset, prepare_batch_inputs
21
+ from baselines.clip_alignment_with_language.inference import eval_epoch, start_inference
22
+ from utils.basic_utils import save_jsonl, save_json, AverageMeter
23
+ from utils.model_utils import count_parameters
24
+
25
+
26
+ import logging
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s",
29
+ datefmt="%Y-%m-%d %H:%M:%S",
30
+ level=logging.INFO)
31
+
32
+
33
+ def set_seed(seed, use_cuda=True):
34
+ random.seed(seed)
35
+ np.random.seed(seed)
36
+ torch.manual_seed(seed)
37
+ if use_cuda:
38
+ torch.cuda.manual_seed_all(seed)
39
+
40
+
41
+ def train_epoch(model, train_loader, optimizer, opt, epoch_i):
42
+ model.train()
43
+
44
+ # init meters
45
+ dataloading_time = AverageMeter()
46
+ prepare_inputs_time = AverageMeter()
47
+ model_forward_time = AverageMeter()
48
+ model_backward_time = AverageMeter()
49
+ loss_meter = AverageMeter()
50
+
51
+ num_training_examples = len(train_loader)
52
+ timer_dataloading = time.time()
53
+ for batch_idx, batch in tqdm(enumerate(train_loader),
54
+ desc="Training Iteration",
55
+ total=num_training_examples):
56
+ dataloading_time.update(time.time() - timer_dataloading)
57
+
58
+ # continue
59
+ timer_start = time.time()
60
+ model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory)
61
+ prepare_inputs_time.update(time.time() - timer_start)
62
+ # logger.info("model_inputs {}"
63
+ # .format({k: (type(k), v.shape if isinstance(v, torch.Tensor) else v)
64
+ # for k, v in model_inputs.items()}))
65
+ # logger.info("model_inputs \n{}".format({k: (type(v), v.shape, v.dtype) for k, v in model_inputs.items()}))
66
+ timer_start = time.time()
67
+ loss = model(**model_inputs)
68
+ model_forward_time.update(time.time() - timer_start)
69
+ timer_start = time.time()
70
+ optimizer.zero_grad()
71
+ loss.backward()
72
+ if opt.grad_clip != -1:
73
+ nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)
74
+ optimizer.step()
75
+ model_backward_time.update(time.time() - timer_start)
76
+
77
+ global_step = epoch_i * num_training_examples + batch_idx
78
+ opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step)
79
+ opt.writer.add_scalar("Train/Loss", float(loss), global_step)
80
+ loss_meter.update(float(loss))
81
+
82
+ timer_dataloading = time.time()
83
+ if opt.debug and batch_idx == 3:
84
+ break
85
+ to_write = opt.train_log_txt_formatter.format(
86
+ time_str=time.strftime("%Y_%m_%d_%H_%M_%S"),
87
+ epoch=epoch_i,
88
+ loss_str=str(loss_meter.avg))
89
+ with open(opt.train_log_filepath, "a") as f:
90
+ f.write(to_write)
91
+ print("Epoch time stats:")
92
+ print("dataloading_time: max {dataloading_time.max} "
93
+ "min {dataloading_time.min} avg {dataloading_time.avg}\n"
94
+ "prepare_inputs_time: max {prepare_inputs_time.max} "
95
+ "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n"
96
+ "model_forward_time: max {model_forward_time.max} "
97
+ "min {model_forward_time.min} avg {model_forward_time.avg}\n"
98
+ "model_backward_time: max {model_backward_time.max} "
99
+ "min {model_backward_time.min} avg {model_backward_time.avg}\n"
100
+ "".format(dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time,
101
+ model_forward_time=model_forward_time, model_backward_time=model_backward_time))
102
+
103
+
104
+ def train(model, train_dataset, val_dataset, opt):
105
+ # Prepare optimizer
106
+ optimizer = torch.optim.SGD(
107
+ filter(lambda p: p.requires_grad, model.parameters()),
108
+ lr=opt.lr,
109
+ weight_decay=opt.wd,
110
+ momentum=opt.momentum)
111
+ # reduce the lr by 0.1 every 30 epochs
112
+ scheduler = torch.optim.lr_scheduler.StepLR(
113
+ optimizer,
114
+ step_size=30,
115
+ gamma=0.1
116
+ )
117
+
118
+ train_loader = DataLoader(train_dataset,
119
+ collate_fn=proposal_retrieval_collate,
120
+ batch_size=opt.bsz,
121
+ num_workers=opt.num_workers,
122
+ shuffle=True,
123
+ pin_memory=opt.pin_memory)
124
+
125
+ prev_best_score = 0.
126
+ es_cnt = 0
127
+ start_epoch = -1 if opt.eval_untrained else 0
128
+ eval_tasks_at_training = ["SVMR", ]
129
+ save_submission_filename = \
130
+ "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, "_".join(eval_tasks_at_training))
131
+ for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"):
132
+ if epoch_i > -1:
133
+ with torch.autograd.detect_anomaly():
134
+ train_epoch(model, train_loader, optimizer, opt, epoch_i)
135
+ global_step = (epoch_i + 1) * len(train_loader)
136
+ scheduler.step()
137
+ if opt.eval_path is not None:
138
+ with torch.no_grad():
139
+ metrics_no_nms, metrics_nms, latest_file_paths = \
140
+ eval_epoch(model, val_dataset, opt, save_submission_filename, tasks=eval_tasks_at_training,
141
+ max_before_nms=300, max_after_nms=100)
142
+ logger.info("metrics_no_nms {}".format(
143
+ pprint.pformat(rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4)))
144
+ logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4)))
145
+
146
+ to_write = opt.eval_log_txt_formatter.format(
147
+ time_str=time.strftime("%Y_%m_%d_%H_%M_%S"),
148
+ epoch=epoch_i,
149
+ eval_metrics_str=json.dumps(metrics_no_nms))
150
+ with open(opt.eval_log_filepath, "a") as f:
151
+ f.write(to_write)
152
+
153
+ # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms
154
+ metrics = metrics_no_nms
155
+ # early stop/ log / save model
156
+ for task_type, task_metrics in metrics.items():
157
+ for iou_thd in [0.5, 0.7]:
158
+ opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd),
159
+ {k: v for k, v in task_metrics.items() if str(iou_thd) in k},
160
+ global_step)
161
+
162
+ # use the most strict metric available
163
+ if metrics["SVMR"]["0.5-r1"] > prev_best_score:
164
+ es_cnt = 0
165
+ prev_best_score = metrics["SVMR"]["0.5-r1"]
166
+
167
+ checkpoint = {
168
+ "model": model.state_dict(),
169
+ "model_cfg": model.config,
170
+ "epoch": epoch_i}
171
+ torch.save(checkpoint, opt.ckpt_filepath)
172
+
173
+ best_file_paths = [e.replace("latest", "best") for e in latest_file_paths]
174
+ for src, tgt in zip(latest_file_paths, best_file_paths):
175
+ os.renames(src, tgt)
176
+ logger.info("The checkpoint file has been updated.")
177
+ else:
178
+ es_cnt += 1
179
+ if es_cnt > opt.max_es_cnt: # early stop
180
+ with open(opt.train_log_filepath, "a") as f:
181
+ f.write("Early Stop at epoch {}".format(epoch_i))
182
+ logger.info("Early stop at {} with SVMR 0.5-r1 {}".format(epoch_i, prev_best_score))
183
+ break
184
+ else:
185
+ checkpoint = {
186
+ "model": model.state_dict(),
187
+ "model_cfg": model.config,
188
+ "epoch": epoch_i}
189
+ torch.save(checkpoint, opt.ckpt_filepath)
190
+
191
+ if opt.debug:
192
+ break
193
+
194
+ opt.writer.close()
195
+
196
+
197
+ def rm_key_from_odict(odict_obj, rm_suffix):
198
+ """remove key entry from the OrderedDict"""
199
+ return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k])
200
+
201
+
202
+ def start_training():
203
+ logger.info("Setup config, data and model...")
204
+ opt = BaseOptions().parse()
205
+ set_seed(opt.seed)
206
+ if opt.debug: # keep the model run deterministically
207
+ # 'cudnn.benchmark = True' enables auto-tuning to find the best algorithm for a specific input/net config.
208
+ # Enable this only when input size is fixed.
209
+ cudnn.benchmark = False
210
+ cudnn.deterministic = True
211
+
212
+ opt.writer = SummaryWriter(opt.tensorboard_log_dir)
213
+ opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n"
214
+ opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n"
215
+
216
+ train_dataset = ProposalRetrievalDataset(
217
+ dset_name=opt.dset_name,
218
+ model_type=opt.model_type,
219
+ data_path=opt.train_path,
220
+ desc_bert_path=opt.desc_bert_path,
221
+ sub_bert_path=opt.sub_bert_path,
222
+ max_desc_len=opt.max_desc_l,
223
+ vid_feat_path=opt.vid_feat_path,
224
+ clip_length=opt.clip_length,
225
+ vid_feat_size=opt.vid_feat_size,
226
+ sub_feat_size=opt.sub_feat_size,
227
+ ctx_mode=opt.ctx_mode,
228
+ pos_iou_thd=opt.pos_iou_thd,
229
+ neg_iou_thd=opt.neg_iou_thd,
230
+ h5driver=opt.h5driver,
231
+ data_ratio=opt.data_ratio,
232
+ normalize_vfeat=not opt.no_norm_vfeat,
233
+ normalize_tfeat=not opt.no_norm_tfeat,
234
+ external_train_vr_res_path=opt.external_train_vr_res_path, # If not None, used to guide negative sampling
235
+ corpus_path=opt.corpus_path,
236
+ )
237
+
238
+ if opt.eval_path is not None:
239
+ eval_dataset = ProposalRetrievalEvalDataset(
240
+ dset_name=opt.dset_name,
241
+ model_type=opt.model_type,
242
+ eval_split_name=opt.eval_split_name, # should only be val set
243
+ data_path=opt.eval_path,
244
+ desc_bert_path_or_handler=train_dataset.desc_bert_h5,
245
+ sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None,
246
+ max_desc_len=opt.max_desc_l,
247
+ corpus_path=opt.corpus_path,
248
+ vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None,
249
+ clip_length=opt.clip_length,
250
+ eval_proposal_bsz=opt.eval_proposal_bsz,
251
+ ctx_mode=opt.ctx_mode,
252
+ data_mode="query",
253
+ h5driver=opt.h5driver,
254
+ data_ratio=opt.data_ratio,
255
+ normalize_vfeat=not opt.no_norm_vfeat,
256
+ normalize_tfeat=not opt.no_norm_tfeat,
257
+ )
258
+ else:
259
+ eval_dataset = None
260
+
261
+ model_config = EDict(
262
+ visual_input_size=train_dataset.vid_feat_output_size, # changes based on visual input type
263
+ textual_input_size=train_dataset.sub_feat_output_size,
264
+ query_feat_size=opt.desc_feat_size,
265
+ visual_hidden_size=opt.visual_hidden_size, #
266
+ output_size=opt.output_size,
267
+ embedding_size=opt.embedding_size,
268
+ lstm_hidden_size=opt.lstm_hidden_size,
269
+ margin=opt.margin, # margin for ranking loss
270
+ loss_type=opt.loss_type, # loss type, 'hinge' or 'lse'
271
+ inter_loss_weight=opt.inter_loss_weight * (opt.ctx_mode == "tef"), # weight for inter negatives
272
+ ctx_mode=opt.ctx_mode
273
+ )
274
+ logger.info("model_config {}".format(model_config))
275
+
276
+ model = CALWithSub(model_config)
277
+ if opt.device.type == "cuda":
278
+ logger.info("CUDA enabled.")
279
+ model.to(opt.device)
280
+ if len(opt.device_ids) > 1:
281
+ logger.info("Use multi GPU", opt.device_ids)
282
+ model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU
283
+
284
+ if opt.init_ckpt_path is not None:
285
+ checkpoint = torch.load(opt.init_ckpt_path)
286
+ model.load_state_dict(checkpoint["model"])
287
+ logger.info("Loaded model saved at epoch {} from checkpoint: {}"
288
+ .format(checkpoint["epoch"], opt.init_ckpt_path))
289
+ count_parameters(model)
290
+
291
+ logger.info("Start Training...")
292
+ train(model, train_dataset, eval_dataset, opt)
293
+ return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug
294
+
295
+
296
+ if __name__ == '__main__':
297
+ model_dir, eval_split_name, eval_path, debug = start_training()
298
+ if not debug:
299
+ model_dir = model_dir.split(os.sep)[-1]
300
+ tasks = ["SVMR", "VCMR"]
301
+ input_args = ["--model_dir", model_dir,
302
+ "--eval_split_name", eval_split_name,
303
+ "--eval_path", eval_path,
304
+ "--tasks"] + tasks
305
+
306
+ import sys
307
+ sys.argv[1:] = input_args
308
+ logger.info("\n\n\nFINISHED TRAINING!!!")
309
+ logger.info("Evaluating model in {}".format(model_dir))
310
+ start_inference()
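The model_config above selects between a hinge and an LSE ranking loss with a margin; a generic sketch of the two variants (not the exact CALWithSub implementation), assuming score tensors pos_scores and neg_scores of the same shape where higher means more similar:

    import torch

    def ranking_loss(pos_scores, neg_scores, margin=0.1, loss_type="hinge"):
        # encourage pos_scores to exceed neg_scores by at least `margin`
        if loss_type == "hinge":
            return torch.clamp(margin + neg_scores - pos_scores, min=0).mean()
        elif loss_type == "lse":
            # smooth LogSumExp-style approximation of the hinge
            return torch.log1p(torch.exp(margin + neg_scores - pos_scores)).mean()
        raise ValueError("loss_type must be 'hinge' or 'lse'")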
baselines/crossmodal_moment_localization/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Cross-modal Moment Localization (XML)
2
+ ===
baselines/crossmodal_moment_localization/__init__.py ADDED
File without changes
baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (207 Bytes). View file
 
baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc ADDED
Binary file (23.3 kB). View file
 
baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc ADDED
Binary file (24.1 kB). View file
 
baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc ADDED
Binary file (19.8 kB). View file
 
baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc ADDED
Binary file (39.8 kB). View file
 
baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc ADDED
Binary file (5.64 kB). View file
 
baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc ADDED
Binary file (18.8 kB). View file
 
baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc ADDED
Binary file (19.5 kB). View file
 
baselines/crossmodal_moment_localization/config.py ADDED
@@ -0,0 +1,276 @@
 
 
1
+ import os
2
+ import time
3
+ import torch
4
+ import argparse
5
+
6
+ from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile
7
+ from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs
8
+
9
+
10
+ class BaseOptions(object):
11
+ saved_option_filename = "opt.json"
12
+ ckpt_filename = "model.ckpt"
13
+ tensorboard_log_dir = "tensorboard_log"
14
+ train_log_filename = "train.log.txt"
15
+ eval_log_filename = "eval.log.txt"
16
+
17
+ def __init__(self):
18
+ self.parser = argparse.ArgumentParser()
19
+ self.initialized = False
20
+ self.opt = None
21
+
22
+ def initialize(self):
23
+ self.initialized = True
24
+ self.parser.add_argument("--dset_name", type=str, choices=["tvr"])
25
+ self.parser.add_argument("--model_name", type=str)
26
+ self.parser.add_argument("--eval_split_name", type=str, default="val",
27
+ help="should match keys in corpus_path, must set for VCMR")
28
+ self.parser.add_argument("--debug", action="store_true",
29
+ help="debug (fast) mode, break all loops, do not load all data into memory.")
30
+ self.parser.add_argument("--data_ratio", type=float, default=1.0,
31
+ help="how many training and eval data to use. 1.0: use all, 0.1: use 10%."
32
+ "Use small portion for debug purposes. Note this is different from --debug, "
33
+ "which works by breaking the loops, typically they are not used together.")
34
+ self.parser.add_argument("--results_root", type=str, default="results")
35
+ self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training")
36
+ self.parser.add_argument("--seed", type=int, default=2018, help="random seed")
37
+ self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu")
38
+ self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job")
39
+ self.parser.add_argument("--num_workers", type=int, default=4,
40
+ help="num subprocesses used to load the data, 0: use main process")
41
+ self.parser.add_argument("--no_core_driver", action="store_true",
42
+ help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`")
43
+ self.parser.add_argument("--no_pin_memory", action="store_true",
44
+ help="Don't use pin_memory=True for dataloader. "
45
+ "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4")
46
+
47
+ # training config
48
+ self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate")
49
+ self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01,
50
+ help="Proportion of training to perform linear learning rate warmup for. "
51
+ "E.g., 0.1 = 10% of training.")
52
+ self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay")
53
+ self.parser.add_argument("--n_epoch", type=int, default=100, help="number of epochs to run")
54
+ self.parser.add_argument("--max_es_cnt", type=int, default=10,
55
+ help="number of epochs to early stop, use -1 to disable early stop")
56
+ self.parser.add_argument("--stop_task", type=str, default="VCMR", choices=["VCMR", "SVMR", "VR"],
57
+ help="Use metric associated with stop_task for early stop")
58
+ self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+",
59
+ default=["VCMR"], choices=["VCMR", "SVMR", "VR"],
60
+ help="evaluate and report numbers for tasks specified here.")
61
+ self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size")
62
+ self.parser.add_argument("--eval_query_bsz", type=int, default=50,
63
+ help="mini-batch size at inference, for query")
64
+ self.parser.add_argument("--eval_context_bsz", type=int, default=200,
65
+ help="mini-batch size at inference, for video/sub")
66
+ self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model")
67
+ self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable")
68
+ self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss")
69
+ self.parser.add_argument("--lw_neg_q", type=float, default=1,
70
+ help="weight for ranking loss with negative query and positive context")
71
+ self.parser.add_argument("--lw_neg_ctx", type=float, default=1,
72
+ help="weight for ranking loss with positive query and negative context")
73
+ self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss")
74
+ self.parser.add_argument("--train_span_start_epoch", type=int, default=0,
75
+ help="which epoch to start training span prediction, -1 to disable")
76
+ self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"],
77
+ help="att loss type, can be hinge loss or its smooth approximation LogSumExp")
78
+ self.parser.add_argument("--hard_negtiave_start_epoch", type=int, default=20,
79
+ help="which epoch to start hard negative sampling for video-level ranking loss,"
80
+ "use -1 to disable")
81
+ self.parser.add_argument("--hard_pool_size", type=int, default=20,
82
+ help="hard negatives are still sampled, but from a harder pool.")
83
+
84
+ # Model and Data config
85
+ self.parser.add_argument("--max_sub_l", type=int, default=50,
86
+ help="max length of all sub sentence 97.71 under 50 for 3 sentences")
87
+ self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions")
88
+ self.parser.add_argument("--max_ctx_l", type=int, default=100,
89
+ help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100")
90
+
91
+ self.parser.add_argument("--train_path", type=str, default=None)
92
+ self.parser.add_argument("--val_path", type=str, default=None)
93
+ self.parser.add_argument("--test_path", type=str, default=None)
94
+ self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None,
95
+ help="if set, use external video retrieval results to guide evaluation. ")
96
+ self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features")
97
+ self.parser.add_argument("--word2idx_path", type=str,
98
+ help="a dict, {word: word_idx, ...}, "
99
+ "special tokens are {<pad>: 0, <unk>: 1, <eos>: 2}")
100
+ self.parser.add_argument("--vocab_size", type=int, default=-1,
101
+ help="Set automatically to len(word2idx)")
102
+ self.parser.add_argument("--glove_path", type=str,
103
+ help="path to file containing the GloVe embeddings for words in word2idx")
104
+ self.parser.add_argument("--desc_bert_path", type=str, default=None)
105
+ self.parser.add_argument("--sub_bert_path", type=str, default=None)
106
+ self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature")
107
+ self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature")
108
+ self.parser.add_argument("--ctx_mode", type=str, choices=["video", "sub", "video_sub", "tef",
109
+ "video_tef", "sub_tef", "video_sub_tef"],
110
+ help="which context to use. a combination of [video, sub, tef]")
111
+ self.parser.add_argument("--corpus_path", type=str, default=None)
112
+ self.parser.add_argument("--vid_feat_path", type=str, default="")
113
+ self.parser.add_argument("--no_norm_vfeat", action="store_true",
114
+ help="Do not do normalization on video feat, use it only when using resnet_i3d feat")
115
+ self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat")
116
+ self.parser.add_argument("--clip_length", type=float, default=None,
117
+ help="each video will be uniformly segmented into small clips, "
118
+ "will automatically loaded from ProposalConfigs if None")
119
+ self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature")
120
+
121
+ self.parser.add_argument("--span_predictor_type", type=str, default="conv", choices=["conv", "cat_linear"],
122
+ help="how to generate span predictions, "
123
+ "conv: apply 1D-Conv layer on top of NxL dot product of query and clips"
124
+ "cat_linear: cat the query and clips then use a linear layer to give output. "
125
+ "Note cat_linear is implemented as first project query and clips into scores, "
126
+ "separately, then sum them up, this should be similar to first cat then project.")
127
+ self.parser.add_argument("--stack_conv_predictor_conv_kernel_sizes", type=int, default=-1, nargs="+",
128
+ help="combine the results from conv edge detectors of all sizes specified."
129
+ "-1: disable. If specified, will ignore --conv_kernel_size option."
130
+ "This flag is only used when --merge_two_stream and --span_predictor_type conv!")
131
+ self.parser.add_argument("--encoder_type", type=str, default="transformer",
132
+ choices=["gru", "lstm", "transformer", "cnn"])
133
+ self.parser.add_argument("--add_pe_rnn", action="store_true",
134
+ help="Add positional encoding for GRU and LSTM encoder as well")
135
+ self.parser.add_argument("--no_merge_two_stream", action="store_true", help="do not merge video and subtitles")
136
+ self.parser.add_argument("--no_cross_att", action="store_true",
137
+ help="Use cross-attention for modeling video and subtitles")
138
+ self.parser.add_argument("--no_self_att", action="store_true", help="do not use self attention")
139
+ self.parser.add_argument("--no_modular", action="store_true", help="do not use modular attention")
140
+ self.parser.add_argument("--pe_type", type=str, default="cosine", choices=["none", "linear", "cosine"],
141
+ help="Only for query encoding")
142
+ self.parser.add_argument("--max_position_embeddings", type=int, default=300)
143
+ self.parser.add_argument("--hidden_size", type=int, default=256)
144
+ self.parser.add_argument("--n_heads", type=int, default=4)
145
+ self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs")
146
+ self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers")
147
+ self.parser.add_argument("--cross_att_drop", type=float, default=0.1, help="Applied to cross-att")
148
+ self.parser.add_argument("--conv_kernel_size", type=int, default=5)
149
+ self.parser.add_argument("--conv_stride", type=int, default=1)
150
+ self.parser.add_argument("--initializer_range", type=float, default=0.02,
151
+ help="initializer range for linear layer")
152
+ self.parser.add_argument("--eval_num_per_epoch", type=float)
153
+
154
+ # post processing
155
+ self.parser.add_argument("--min_pred_l", type=int, default=2,
156
+ help="constrain the [st, ed] with ed - st >= 2"
157
+ "(2 clips with length 1.5 each, 3 secs in total"
158
+ "this is the min length for proposal-based method)")
159
+ self.parser.add_argument("--max_pred_l", type=int, default=16,
160
+ help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total"
161
+ "(16 clips with length 1.5 each, "
162
+ "this is the max length for proposal-based method)")
163
+ self.parser.add_argument("--q2c_alpha", type=float, default=20,
164
+ help="give more importance to top scored videos' spans, "
165
+ "the new score will be: s_new = exp(alpha * s), "
166
+ "higher alpha indicates more importance. Note s in [-1, 1]")
167
+
168
+ self.parser.add_argument("--max_before_nms", type=int, default=200)
169
+ self.parser.add_argument("--max_vcmr_video", type=int, default=100,
170
+ help="re-ranking in top-max_vcmr_video")
171
+ self.parser.add_argument("--nms_thd", type=float, default=-1,
172
+ help="additionally use non-maximum suppression "
173
+ "(or non-minimum suppression for distance)"
174
+ "to post-processing the predictions. "
175
+ "-1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap,")
176
+
177
+ def display_save(self, opt):
178
+ args = vars(opt)
179
+ # Display settings
180
+ print("------------ Options -------------\n{}\n-------------------"
181
+ .format({str(k): str(v) for k, v in sorted(args.items())}))
182
+
183
+ # Save settings
184
+ if not isinstance(self, TestOptions):
185
+ option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed
186
+ save_json(args, option_file_path, save_pretty=True)
187
+
188
+ def parse(self):
189
+ if not self.initialized:
190
+ self.initialize()
191
+ opt = self.parser.parse_args()
192
+
193
+ if opt.debug:
194
+ opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ])
195
+ opt.no_core_driver = True
196
+ opt.num_workers = 0
197
+ opt.eval_query_bsz = 100
198
+
199
+ if isinstance(self, TestOptions):
200
+ # modify model_dir to absolute path
201
+ opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir)
202
+ saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename))
203
+ for arg in saved_options: # use saved options to overwrite all BaseOptions args.
204
+ if arg not in ["results_root", "num_workers", "nms_thd", "debug",
205
+ "eval_split_name", "eval_path", "eval_query_bsz", "eval_context_bsz",
206
+ "max_pred_l", "min_pred_l", "external_inference_vr_res_path"]:
207
+ setattr(opt, arg, saved_options[arg])
208
+ # opt.no_core_driver = True
209
+ else:
210
+ if opt.exp_id is None:
211
+ raise ValueError("--exp_id is required for at a training option!")
212
+
213
+ if opt.clip_length is None:
214
+ opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"]
215
+ print("Loaded clip_length {} from proposal config file".format(opt.clip_length))
216
+ opt.results_dir = os.path.join(opt.results_root, "_".join([opt.model_name, opt.exp_id, time.strftime("%Y%m%d_%H%M%S")]))
217
+ mkdirp(opt.results_dir)
218
+ # save a copy of current code
219
+ code_dir = os.path.dirname(os.path.realpath(__file__))
220
+ code_zip_filename = os.path.join(opt.results_dir, "code.zip")
221
+ make_zipfile(code_dir, code_zip_filename,
222
+ enclosing_dir="code",
223
+ exclude_dirs_substring="results",
224
+ exclude_dirs=["results", "debug_results", "__pycache__"],
225
+ exclude_extensions=[".pyc", ".ipynb", ".swap"],)
226
+
227
+ self.display_save(opt)
228
+
229
+ if "sub" in opt.ctx_mode:
230
+ assert opt.dset_name == "tvr", "sub is only supported for tvr dataset"
231
+
232
+ if opt.hard_negtiave_start_epoch != -1:
233
+ if opt.hard_pool_size > opt.bsz:
234
+ print("[WARNING] hard_pool_size is larger than bsz")
235
+
236
+ assert opt.stop_task in opt.eval_tasks_at_training
237
+ opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename)
238
+ opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename)
239
+ opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename)
240
+ opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir)
241
+ opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu")
242
+ opt.h5driver = None if opt.no_core_driver else "core"
243
+ # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5
244
+ opt.num_workers = 1 if opt.no_core_driver else opt.num_workers
245
+ opt.pin_memory = not opt.no_pin_memory
246
+
247
+ if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d
248
+ assert opt.no_norm_vfeat
249
+
250
+ if "tef" in opt.ctx_mode and "video" in opt.ctx_mode:
251
+ opt.vid_feat_size += 2
252
+ if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode:
253
+ opt.sub_feat_size += 2
254
+
255
+ if "video" not in opt.ctx_mode or "sub" not in opt.ctx_mode:
256
+ opt.no_merge_two_stream = True
257
+ opt.no_cross_att = True
258
+
259
+ self.opt = opt
260
+ return opt
261
+
262
+
263
+ class TestOptions(BaseOptions):
264
+ """add additional options for evaluating"""
265
+ def initialize(self):
266
+ BaseOptions.initialize(self)
267
+ # also need to specify --eval_split_name
268
+ self.parser.add_argument("--eval_id", type=str, help="evaluation id")
269
+ self.parser.add_argument("--model_dir", type=str,
270
+ help="dir contains the model file, will be converted to absolute path afterwards")
271
+ self.parser.add_argument("--tasks", type=str, nargs="+",
272
+ choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"],
273
+ help="Which tasks to run."
274
+ "VCMR: Video Corpus Moment Retrieval;"
275
+ "SVMR: Single Video Moment Retrieval;"
276
+ "VR: regular Video Retrieval. (will be performed automatically with VCMR)")
baselines/crossmodal_moment_localization/inference.py ADDED
@@ -0,0 +1,414 @@
 
 
1
+ import os
2
+ import copy
3
+ import math
4
+ import time
5
+ import pprint
6
+ from tqdm import tqdm, trange
7
+ import numpy as np
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.backends.cudnn as cudnn
12
+ from torch.utils.data import DataLoader
13
+
14
+ from baselines.crossmodal_moment_localization.config import TestOptions
15
+ from baselines.crossmodal_moment_localization.model_xml import XML
16
+ from baselines.crossmodal_moment_localization.start_end_dataset import \
17
+ start_end_collate, StartEndEvalDataset, prepare_batch_inputs
18
+ from baselines.clip_alignment_with_language.inference import \
19
+ get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms
20
+ from utils.basic_utils import save_json, load_json
21
+ from utils.tensor_utils import find_max_triples_from_upper_triangle_product
22
+ from standalone_eval.eval import eval_retrieval
23
+
24
+ import logging
25
+ from ndcg_iou_topk import calculate_ndcg_iou
26
+
27
+
28
+
29
+
30
+ def compute_context_info(model, eval_dataset, opt):
31
+ """Use val set to do evaluation, remember to run with torch.no_grad().
32
+ estimated 2200 (videos) * 100 (frm) * 500 (hsz) * 4 (B) * 2 (video/sub) * 2 (layers) / (1024 ** 3) = ~1.64 GB
33
+ max_n_videos: only consider max_n_videos videos for each query to return st_ed scores.
34
+ """
35
+ model.eval()
36
+ # eval_dataset.set_data_mode("context")
37
+ context_dataloader = DataLoader(eval_dataset,
38
+ collate_fn=start_end_collate,
39
+ batch_size=opt.eval_context_bsz,
40
+ num_workers=opt.num_workers,
41
+ shuffle=False,
42
+ pin_memory=opt.pin_memory)
43
+
44
+ metas = [] # list(dicts)
45
+ video_feat1 = []
46
+ video_feat2 = []
47
+ video_mask = []
48
+ sub_feat1 = []
49
+ sub_feat2 = []
50
+ sub_mask = []
51
+ for idx, batch in tqdm(enumerate(context_dataloader),
52
+ desc="Computing query2video scores",
53
+ total=len(context_dataloader)):
54
+ metas.extend(batch[0])
55
+ model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory)
56
+
57
+ _video_feat1, _video_feat2, _sub_feat1, _sub_feat2 = model.encode_context(
58
+ model_inputs["video_feat"], model_inputs["video_mask"],
59
+ model_inputs["sub_feat"], model_inputs["sub_mask"])
60
+ if "video" in opt.ctx_mode:
61
+ video_feat1.append(_video_feat1)
62
+ video_feat2.append(_video_feat2)
63
+ video_mask.append(model_inputs["video_mask"])
64
+ if "sub" in opt.ctx_mode:
65
+ sub_feat1.append(_sub_feat1)
66
+ sub_feat2.append(_sub_feat2)
67
+ sub_mask.append(model_inputs["sub_mask"])
68
+
69
+ def cat_tensor(tensor_list):
70
+ if len(tensor_list) == 0:
71
+ return None
72
+ else:
73
+ seq_l = [e.shape[1] for e in tensor_list]
74
+ b_sizes = [e.shape[0] for e in tensor_list]
75
+ b_sizes_cumsum = np.cumsum([0] + b_sizes)
76
+ if len(tensor_list[0].shape) == 3:
77
+ hsz = tensor_list[0].shape[2]
78
+ res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l), hsz)
79
+ elif len(tensor_list[0].shape) == 2:
80
+ res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l))
81
+ else:
82
+ raise ValueError("Only support 2/3 dimensional tensors")
83
+ for i, e in enumerate(tensor_list):
84
+ res_tensor[b_sizes_cumsum[i]:b_sizes_cumsum[i+1], :seq_l[i]] = e
85
+ return res_tensor
86
+
87
+ return metas, dict(
88
+ video_feat1=cat_tensor(video_feat1), # (N_videos, L, hsz),
89
+ video_feat2=cat_tensor(video_feat2),
90
+ video_mask=cat_tensor(video_mask), # (N_videos, L)
91
+ sub_feat1=cat_tensor(sub_feat1),
92
+ sub_feat2=cat_tensor(sub_feat2),
93
+ sub_mask=cat_tensor(sub_mask),
94
+ )
95
+
96
+
97
+ def index_if_not_none(input_tensor, indices):
98
+ if input_tensor is None:
99
+ return input_tensor
100
+ else:
101
+ return input_tensor[indices]
102
+
103
+
104
+
105
+
106
+ def generate_min_max_length_mask(array_shape, min_l, max_l):
107
+ """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked,
108
+ below is the case for 4x4.
109
+ [[0, 1, 1, 0],
110
+ [0, 0, 1, 1],
111
+ [0, 0, 0, 1],
112
+ [0, 0, 0, 0]]
113
+
114
+ Args:
115
+ array_shape: tuple (e.g., np.ndarray.shape), the last two dimensions should be equal
116
+ min_l: int, minimum length of predicted span
117
+ max_l: int, maximum length of predicted span
118
+
119
+ Returns:
120
+ final_prob_mask: np.ndarray of shape (1, ..., 1, L, L), 1 where min_l <= ed_idx - st_idx < max_l, else 0
121
+ """
122
+ single_dims = (1, ) * (len(array_shape) - 2)
123
+ mask_shape = single_dims + array_shape[-2:]
124
+ extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L)
125
+ mask_triu = np.triu(extra_length_mask_array, k=min_l)
126
+ mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l)
127
+ final_prob_mask = mask_triu * mask_triu_reversed
128
+ return final_prob_mask # with valid bit to be 1
129
+
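# For example, generate_min_max_length_mask((1, 4, 4), min_l=1, max_l=3) keeps exactly the
# positions with 1 <= ed_idx - st_idx < 3, reproducing the 4x4 pattern in the docstring above;
# multiplying it into the st/ed probability products zeroes out spans that are too short or too long.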
130
+
131
+ def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx,
132
+ clip_length, min_pred_l, max_pred_l, max_before_nms):
133
+ """
134
+ Args:
135
+ svmr_gt_st_probs: np.ndarray (N_queries, L), value range [0, 1]
136
+ svmr_gt_ed_probs:
137
+ query_metas:
138
+ video2idx:
139
+ clip_length: float, how long each clip is in seconds
140
+ min_pred_l: int, minimum number of clips
141
+ max_pred_l: int, maximum number of clips
142
+ max_before_nms: get top-max_before_nms predictions for each query
143
+
144
+ Returns:
145
+
146
+ """
147
+ svmr_res = []
148
+ query_vid_names = [e["vid_name"] for e in query_metas]
149
+
150
+ # masking very long ones! Since most are relatively short.
151
+ st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs) # (N, L, L)
152
+ # extra_length_mask_array = np.ones(st_ed_prob_product.shape, dtype=bool) # (N, L, L)
153
+ # mask_triu = np.triu(extra_length_mask_array, k=min_pred_l)
154
+ # mask_triu_reversed = np.logical_not(np.triu(extra_length_mask_array, k=max_pred_l))
155
+ # final_prob_mask = np.logical_and(mask_triu, mask_triu_reversed) # with valid bit to be 1
156
+ valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l)
157
+ st_ed_prob_product *= valid_prob_mask # invalid location will become zero!
158
+
159
+ batched_sorted_triples = find_max_triples_from_upper_triangle_product(
160
+ st_ed_prob_product, top_n=max_before_nms, prob_thd=None)
161
+ for i, q_vid_name in tqdm(enumerate(query_vid_names),
162
+ desc="[SVMR] Loop over queries to generate predictions",
163
+ total=len(query_vid_names)): # i is query_id
164
+ q_m = query_metas[i]
165
+ video_idx = video2idx[q_vid_name]
166
+ _sorted_triples = batched_sorted_triples[i]
167
+ _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment.
168
+ _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length
169
+ # [video_idx(int), st(float), ed(float), score(float)]
170
+ cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()]
171
+ cur_query_pred = dict(
172
+ query_id=q_m["query_id"],
173
+ desc=q_m["desc"],
174
+ predictions=cur_ranked_predictions
175
+ )
176
+ svmr_res.append(cur_query_pred)
177
+ return svmr_res
178
+
179
+
180
+ def load_external_vr_res2(external_vr_res_path, top_n_vr_videos=5):
181
+ """return a mapping from query_id to top retrieved video info"""
182
+ external_vr_res = load_json(external_vr_res_path)
183
+ external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"]
184
+ query2video = {e["query_id"]: e["predictions"] for e in external_vr_res}
185
+ return query2video
186
+
187
+
188
+ def compute_query2ctx_info(model, eval_dataset, opt, video_metas, ctx_info,
189
+ max_before_nms=1000, max_n_videos=100, maxtopk=40):
190
+ """Use val set to do evaluation, remember to run with torch.no_grad().
191
+ estimated size 20,000 (query) * 500 (hsz) * 4 / (1024**2) = 38.15 MB
192
+ max_n_videos: int, use max_n_videos videos for computing VCMR/VR results
193
+ """
194
+
195
+ video2idx = eval_dataset.video2idx
196
+ # video_metas = ctx_info["video_metas"]
197
+ if opt.external_inference_vr_res_path is not None:
198
+ video_idx2meta_idx = {video2idx[m["vid_name"]]: i for i, m in enumerate(video_metas)}
199
+ external_query2video = \
200
+ load_external_vr_res2(opt.external_inference_vr_res_path, top_n_vr_videos=max_n_videos)
201
+ # maps query_id to a list of video meta indices
202
+ external_query2video_meta_idx = \
203
+ {k: [video_idx2meta_idx[e[0]] for e in v] for k, v in external_query2video.items()}
204
+ else:
205
+ external_query2video = None
206
+ external_query2video_meta_idx = None
207
+
208
+ model.eval()
209
+ eval_dataset.set_data_mode("query")
210
+ # eval_dataset.load_gt_vid_name_for_query(is_svmr)
211
+ query_eval_loader = DataLoader(eval_dataset,
212
+ collate_fn=start_end_collate,
213
+ batch_size=opt.eval_query_bsz,
214
+ num_workers=opt.num_workers,
215
+ shuffle=False,
216
+ pin_memory=opt.pin_memory)
217
+ n_total_videos = len(video_metas)
218
+ n_total_query = len(eval_dataset)
219
+ bsz = opt.eval_query_bsz
220
+
221
+ flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int)
222
+ flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32)
223
+ sorted_q2c_indices = np.empty((n_total_query, max_n_videos), dtype=int)
224
+ sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32)
225
+
226
+
227
+ query_metas = []
228
+ for idx, batch in tqdm(
229
+ enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)):
230
+ _query_metas = batch[0]
231
+ query_metas.extend(batch[0])
232
+ model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory)
233
+ # query_context_scores (_N_q, N_videos), st_prob, ed_prob (_N_q, N_videos, L)
234
+ _query_context_scores, _st_probs, _ed_probs = \
235
+ model.get_pred_from_raw_query(model_inputs["query_feat"], model_inputs["query_mask"],
236
+ ctx_info["video_feat1"], ctx_info["video_feat2"],
237
+ ctx_info["video_mask"],
238
+ ctx_info["sub_feat1"], ctx_info["sub_feat2"],
239
+ ctx_info["sub_mask"],
240
+ cross=True)
241
+ # _query_context_scores = _query_context_scores + 1 # move cosine similarity to [0, 2]
242
+ # To give more importance to top scores, the higher opt.alpha is the more importance will be given
243
+ _query_context_scores = torch.exp(opt.q2c_alpha * _query_context_scores)
244
+
245
+ # normalize to get true probabilities!!!
246
+ # the probabilities here are already (pad) masked, so only need to do softmax
247
+ _st_probs = F.softmax(_st_probs, dim=-1) # (_N_q, N_videos, L)
248
+ _ed_probs = F.softmax(_ed_probs, dim=-1)
249
+
250
+ if external_query2video is None:
251
+ _sorted_q2c_scores, _sorted_q2c_indices = \
252
+ torch.topk(_query_context_scores, max_n_videos, dim=1, largest=True)
253
+ else:
254
+ relevant_video_info = [external_query2video[qm["query_id"]] for qm in _query_metas]
255
+ _sorted_q2c_indices = _query_context_scores.new(
256
+ [[video_idx2meta_idx[sub_e[0]] for sub_e in e] for e in relevant_video_info]).long()
257
+ _sorted_q2c_scores = _query_context_scores.new(
258
+ [[sub_e[3] for sub_e in e] for e in relevant_video_info])
259
+ _sorted_q2c_scores = torch.exp(opt.q2c_alpha * _sorted_q2c_scores)
260
+ # collect data for vr and vcmr
261
+ sorted_q2c_indices[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_indices.cpu().numpy()
262
+ sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_scores.cpu().numpy()
263
+
264
+
265
+ # Get VCMR results
266
+ # compute combined scores
267
+ row_indices = torch.arange(0, len(_st_probs), device=opt.device).unsqueeze(1)
268
+ _st_probs = _st_probs[row_indices, _sorted_q2c_indices] # (_N_q, max_n_videos, L)
269
+ _ed_probs = _ed_probs[row_indices, _sorted_q2c_indices]
270
+
271
+ # (_N_q, max_n_videos, L, L)
272
+ _st_ed_scores = torch.einsum("qvm,qv,qvn->qvmn", _st_probs, _sorted_q2c_scores, _ed_probs)
273
+ valid_prob_mask = generate_min_max_length_mask(
274
+ _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l)
275
+ _st_ed_scores *= torch.from_numpy(
276
+ valid_prob_mask).to(_st_ed_scores.device) # invalid location will become zero!
277
+
278
+ # sort across the top-max_n_videos videos (by flattening from the 2nd dim onwards)
279
+ # the indices here are local indices, not global indices
280
+ _n_q = _st_ed_scores.shape[0]
281
+ _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1) # (N_q, max_n_videos*L*L)
282
+ _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \
283
+ torch.sort(_flat_st_ed_scores, dim=1, descending=True)
284
+ # collect data
285
+ flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \
286
+ _flat_st_ed_sorted_scores[:, :max_before_nms].cpu().numpy()
287
+ flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \
288
+ _flat_st_ed_scores_sorted_indices[:, :max_before_nms].cpu().numpy()
289
+
290
+ if opt.debug:
291
+ break
292
+
293
+
294
+ vcmr_res = {}
295
+ for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm(
296
+ enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)),
297
+ desc="[VCMR] Loop over queries to generate predictions", total=n_total_query): # i is query_idx
298
+ # list([video_idx(int), st(float), ed(float), score(float)])
299
+ video_meta_indices_local, pred_st_indices, pred_ed_indices = \
300
+ np.unravel_index(_flat_st_ed_scores_sorted_indices,
301
+ shape=(max_n_videos, opt.max_ctx_l, opt.max_ctx_l))
302
+ # video_meta_indices_local refers to the indices among the top-max_n_videos
303
+ # video_meta_indices refers to the indices among all videos, i.e., the true (global) indices
304
+ video_meta_indices = sorted_q2c_indices[i, video_meta_indices_local]
305
+
306
+ pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length
307
+ pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length
308
+ cur_vcmr_predictions = []
309
+ for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices, _flat_st_ed_sorted_scores)): # videos
310
+ video_idx = video2idx[video_metas[v_meta_idx]["vid_name"]]
311
+ cur_vcmr_predictions.append(
312
+ {
313
+ "video_name": video_metas[v_meta_idx]["vid_name"],
314
+ "timestamp": [float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j])],
315
+ "model_scores": float(v_score)
316
+ }
317
+ )
318
+ query_id = query_metas[i]["query_id"]
319
+ vcmr_res[query_id] = cur_vcmr_predictions[:maxtopk]
320
+ return vcmr_res
321
+
322
+
323
+ def get_eval_res(model, eval_dataset, context_data, opt, maxtopk):
324
+ """compute and save query and video proposal embeddings"""
325
+
326
+ video_metas, context_info = compute_context_info(model, context_data, opt)
327
+ eval_res = compute_query2ctx_info(model, eval_dataset, opt, video_metas, context_info,
328
+ max_before_nms=opt.max_before_nms, max_n_videos=opt.max_vcmr_video, maxtopk=maxtopk)
329
+ return eval_res
330
+
331
+
332
+ POST_PROCESSING_MMS_FUNC = {
333
+ "SVMR": post_processing_svmr_nms,
334
+ "VCMR": post_processing_vcmr_nms
335
+ }
336
+
337
+ # def get_prediction_top_n(list_dict_predictions, top_n):
338
+ # top_n_res = []
339
+ # for e in list_dict_predictions:
340
+ # e["predictions"] = e["predictions"][:top_n]
341
+ # top_n_res.append(e)
342
+ # return top_n_res
343
+
344
+
345
+ def eval_epoch(model, eval_dataset, context_data, logger, opt, max_after_nms, iou_thds, topks):
346
+ """max_after_nms: always set to 100, since the eval script only evaluate top-100"""
347
+ # IOU_THDS = (0.3, 0.5, 0.7)
348
+
349
+ model.eval()
350
+ pred_data = get_eval_res(model, eval_dataset, context_data, opt, max(topks))
351
+ # pred_data = get_prediction_top_n(eval_res, top_n=max_after_nms)
352
+ gt_data = eval_dataset.ground_truth
353
+ average_ndcg = calculate_ndcg_iou(gt_data, pred_data, iou_thds, topks)
354
+ return average_ndcg, pred_data
355
+
356
+ def setup_model(opt):
357
+ """Load model from checkpoint and move to specified device"""
358
+ checkpoint = torch.load(opt.ckpt_filepath)
359
+ loaded_model_cfg = checkpoint["model_cfg"]
360
+ loaded_model_cfg["stack_conv_predictor_conv_kernel_sizes"] = -1
361
+ model = XML(loaded_model_cfg)
362
+ model.load_state_dict(checkpoint["model"])
363
+ logger.info("Loaded model saved at epoch {} from checkpoint: {}"
364
+ .format(checkpoint["epoch"], opt.ckpt_filepath))
365
+
366
+ if opt.device.type == "cuda":
367
+ logger.info("CUDA enabled.")
368
+ model.to(opt.device)
369
+ if len(opt.device_ids) > 1:
370
+ logger.info("Use multi GPU", opt.device_ids)
371
+ model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU
372
+ return model
373
+
374
+
375
+ def start_inference():
376
+ logger.info("Setup config, data and model...")
377
+ opt = TestOptions().parse()
378
+ cudnn.benchmark = False
379
+ cudnn.deterministic = True
380
+
381
+ assert opt.eval_path is not None
382
+ eval_dataset = StartEndEvalDataset(
383
+ dset_name=opt.dset_name,
384
+ eval_split_name=opt.eval_split_name, # should only be val set
385
+ data_path=opt.eval_path,
386
+ desc_bert_path_or_handler=opt.desc_bert_path,
387
+ sub_bert_path_or_handler=opt.sub_bert_path,
388
+ max_desc_len=opt.max_desc_l,
389
+ max_ctx_len=opt.max_ctx_l,
390
+ corpus_path=opt.corpus_path,
391
+ vid_feat_path_or_handler=opt.vid_feat_path,
392
+ clip_length=opt.clip_length,
393
+ ctx_mode=opt.ctx_mode,
394
+ data_mode="query",
395
+ h5driver=opt.h5driver,
396
+ data_ratio=opt.data_ratio,
397
+ normalize_vfeat=not opt.no_norm_vfeat,
398
+ normalize_tfeat=not opt.no_norm_tfeat
399
+ )
400
+
401
+ model = setup_model(opt)
402
+ save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format(
403
+ opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks))
404
+ logger.info("Starting inference...")
405
+ with torch.no_grad():
406
+ metrics_no_nms, metrics_nms, latest_file_paths = \
407
+ eval_epoch(model, eval_dataset, opt, save_submission_filename,
408
+ tasks=opt.tasks, max_after_nms=100)
409
+ logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4)))
410
+ logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4)))
411
+
412
+
413
+ if __name__ == '__main__':
414
+ start_inference()
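For reference, the VCMR assembly in compute_query2ctx_info above recovers (video, start, end) triples from the flattened score indices. A minimal sketch of that index arithmetic, with made-up values for max_n_videos, max_ctx_l and clip_length (the real values come from opt):

import numpy as np

max_n_videos, max_ctx_l, clip_length = 10, 100, 1.5   # illustrative settings only
flat_idx = np.array([12345])                          # one index produced by the flattened sort
video_local, st_idx, ed_idx = np.unravel_index(flat_idx, shape=(max_n_videos, max_ctx_l, max_ctx_l))
# video_local indexes into the top-max_n_videos list; sorted_q2c_indices maps it back to a global video
st_sec = st_idx.astype(np.float32) * clip_length                # clip index -> start time in seconds
ed_sec = ed_idx.astype(np.float32) * clip_length + clip_length  # the end is exclusive, so add one clip
print(video_local, st_sec, ed_sec)                              # -> [1] [34.5] [69.]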
baselines/crossmodal_moment_localization/model_components.py ADDED
@@ -0,0 +1,317 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
+ class DepthwiseSeparableConv(nn.Module):
8
+ """
9
+ Depth-wise separable convolution uses fewer parameters than a standard convolution to produce its output.
10
+ :Examples:
11
+ >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1)
12
+ >>> input_tensor = torch.randn(32, 300, 20)
13
+ >>> output = m(input_tensor)
14
+ """
15
+
16
+ def __init__(self, in_ch, out_ch, k, dim=1, relu=True):
17
+ """
18
+ :param in_ch: input hidden dimension size
19
+ :param out_ch: output hidden dimension size
20
+ :param k: kernel size
21
+ :param dim: default 1. 1D conv or 2D conv
22
+ """
23
+ super(DepthwiseSeparableConv, self).__init__()
24
+ self.relu = relu
25
+ if dim == 1:
26
+ self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch,
27
+ kernel_size=k, groups=in_ch, padding=k//2)
28
+ self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch,
29
+ kernel_size=1, padding=0)
30
+ elif dim == 2:
31
+ self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch,
32
+ kernel_size=k, groups=in_ch, padding=k//2)
33
+ self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch,
34
+ kernel_size=1, padding=0)
35
+ else:
36
+ raise Exception("Incorrect dimension!")
37
+
38
+ def forward(self, x):
39
+ """
40
+ :Input: (N, L_in, D)
41
+ :Output: (N, L_out, D)
42
+ """
43
+ x = x.transpose(1, 2)
44
+ if self.relu:
45
+ out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True)
46
+ else:
47
+ out = self.pointwise_conv(self.depthwise_conv(x))
48
+ return out.transpose(1, 2) # (N, L, D)
49
+
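The parameter saving mentioned in the docstring is easy to verify, e.g. in an interactive session; the channel sizes below simply reuse the doctest values and are not tied to any config:

m = DepthwiseSeparableConv(300, 200, 5, dim=1)
n_sep = sum(p.numel() for p in m.parameters())                        # depthwise + pointwise: ~62k
n_full = sum(p.numel() for p in nn.Conv1d(300, 200, 5).parameters())  # plain Conv1d: ~300k
assert n_sep < n_full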
50
+
51
+ class ConvEncoder(nn.Module):
52
+ def __init__(self, kernel_size=7, n_filters=128, dropout=0.1):
53
+ super(ConvEncoder, self).__init__()
54
+ self.dropout = nn.Dropout(dropout)
55
+ self.layer_norm = nn.LayerNorm(n_filters)
56
+ self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True)
57
+
58
+ def forward(self, x, mask):
59
+ """
60
+ :param x: (N, L, D)
61
+ :param mask: (N, L); not used.
62
+ :return: (N, L, D)
63
+ """
64
+ return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D)
65
+
66
+
67
+ class TrainablePositionalEncoding(nn.Module):
68
+ """Construct the embeddings from word, position and token_type embeddings.
69
+ """
70
+ def __init__(self, max_position_embeddings, hidden_size, dropout=0.1):
71
+ super(TrainablePositionalEncoding, self).__init__()
72
+ self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
73
+ self.LayerNorm = nn.LayerNorm(hidden_size)
74
+ self.dropout = nn.Dropout(dropout)
75
+
76
+ def forward(self, input_feat):
77
+ """
78
+ Args:
79
+ input_feat: (N, L, D)
80
+ """
81
+ bsz, seq_length = input_feat.shape[:2]
82
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device)
83
+ position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L)
84
+
85
+ position_embeddings = self.position_embeddings(position_ids)
86
+
87
+ embeddings = self.LayerNorm(input_feat + position_embeddings)
88
+ embeddings = self.dropout(embeddings)
89
+ return embeddings
90
+
91
+
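A quick shape check for TrainablePositionalEncoding, e.g. in an interactive session (the hidden size and lengths below are made up, not taken from any config):

pe = TrainablePositionalEncoding(max_position_embeddings=100, hidden_size=500, dropout=0.1)
feats = torch.randn(4, 20, 500)         # (N, L, D), with L <= max_position_embeddings
assert pe(feats).shape == (4, 20, 500)  # positional info is added; the shape is unchanged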
92
+ class PositionEncoding(nn.Module):
93
+ """
94
+ Add positional information to input tensor.
95
+ :Examples:
96
+ >>> model = PositionEncoding(n_filters=6, max_len=10)
97
+ >>> test_input1 = torch.zeros(3, 10, 6)
98
+ >>> output1 = model(test_input1)
99
+ >>> output1.size()
100
+ >>> test_input2 = torch.zeros(5, 3, 9, 6)
101
+ >>> output2 = model(test_input2)
102
+ >>> output2.size()
103
+ """
104
+
105
+ def __init__(self, n_filters=128, max_len=500, pe_type="cosine"):
106
+ """
107
+ :param n_filters: same as the input hidden size
108
+ :param max_len: maximum sequence length
109
+ :param pe_type: "cosine", "linear", or "none"
110
+ """
111
+ super(PositionEncoding, self).__init__()
112
+ self.pe_type = pe_type
113
+ if pe_type != "none":
114
+ position = torch.arange(0, max_len).float().unsqueeze(1)
115
+ if pe_type == "cosine":
116
+ # Compute the positional encodings once in log space.
117
+ pe = torch.zeros(max_len, n_filters) # (L, D)
118
+ div_term = torch.exp(torch.arange(0, n_filters, 2).float() * - (math.log(10000.0) / n_filters))
119
+ pe[:, 0::2] = torch.sin(position * div_term)
120
+ pe[:, 1::2] = torch.cos(position * div_term)
121
+ elif pe_type == "linear":
122
+ pe = position / max_len
123
+ else:
124
+ raise ValueError
125
+ self.register_buffer("pe", pe) # buffer is a tensor, not a variable, (L, D)
126
+
127
+ def forward(self, x):
128
+ """
129
+ :Input: (*, L, D)
130
+ :Output: (*, L, D) the same size as input
131
+ """
132
+ if self.pe_type != "none":
133
+ pe = self.pe.data[:x.size(-2), :] # (#x.size(-2), n_filters)
134
+ extra_dim = len(x.size()) - 2
135
+ for _ in range(extra_dim):
136
+ pe = pe.unsqueeze(0)
137
+ x = x + pe
138
+ return x
139
+
140
+
141
+ class LinearLayer(nn.Module):
142
+ """linear layer configurable with layer normalization, dropout, ReLU."""
143
+
144
+ def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True):
145
+ super(LinearLayer, self).__init__()
146
+ self.relu = relu
147
+ self.layer_norm = layer_norm
148
+ if layer_norm:
149
+ self.LayerNorm = nn.LayerNorm(in_hsz)
150
+ layers = [
151
+ nn.Dropout(dropout),
152
+ nn.Linear(in_hsz, out_hsz)
153
+ ]
154
+ self.net = nn.Sequential(*layers)
155
+
156
+ def forward(self, x):
157
+ """(N, L, D)"""
158
+ if self.layer_norm:
159
+ x = self.LayerNorm(x)
160
+ x = self.net(x)
161
+ if self.relu:
162
+ x = F.relu(x, inplace=True)
163
+ return x # (N, L, D)
164
+
165
+
166
+ bert_config = dict(
167
+ hidden_size=768,
168
+ intermediate_size=768,
169
+ hidden_dropout_prob=0.1,
170
+ attention_probs_dropout_prob=0.1,
171
+ num_attention_heads=4,
172
+ )
173
+
174
+
175
+ class BertLayer(nn.Module):
176
+ def __init__(self, config, use_self_attention=True):
177
+ super(BertLayer, self).__init__()
178
+ self.use_self_attention = use_self_attention
179
+ if use_self_attention:
180
+ self.attention = BertAttention(config)
181
+ self.intermediate = BertIntermediate(config)
182
+ self.output = BertOutput(config)
183
+
184
+ def forward(self, hidden_states, attention_mask):
185
+ """
186
+ Args:
187
+ hidden_states: (N, L, D)
188
+ attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid
189
+ Returns:
190
+
191
+ """
192
+ if self.use_self_attention:
193
+ attention_output = self.attention(hidden_states, attention_mask)
194
+ else:
195
+ attention_output = hidden_states
196
+ intermediate_output = self.intermediate(attention_output)
197
+ layer_output = self.output(intermediate_output, attention_output)
198
+ return layer_output
199
+
200
+
201
+ class BertAttention(nn.Module):
202
+ def __init__(self, config):
203
+ super(BertAttention, self).__init__()
204
+ self.self = BertSelfAttention(config)
205
+ self.output = BertSelfOutput(config)
206
+
207
+ def forward(self, input_tensor, attention_mask):
208
+ """
209
+ Args:
210
+ input_tensor: (N, L, D)
211
+ attention_mask: (N, L)
212
+ Returns:
213
+ """
214
+ self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask)
215
+ attention_output = self.output(self_output, input_tensor)
216
+ return attention_output
217
+
218
+
219
+ class BertIntermediate(nn.Module):
220
+ def __init__(self, config):
221
+ super(BertIntermediate, self).__init__()
222
+ self.dense = nn.Sequential(
223
+ nn.Linear(config.hidden_size, config.intermediate_size),
224
+ nn.ReLU(True))
225
+
226
+ def forward(self, hidden_states):
227
+ return self.dense(hidden_states)
228
+
229
+
230
+ class BertOutput(nn.Module):
231
+ def __init__(self, config):
232
+ super(BertOutput, self).__init__()
233
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
234
+ self.LayerNorm = nn.LayerNorm(config.hidden_size)
235
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
236
+
237
+ def forward(self, hidden_states, input_tensor):
238
+ hidden_states = self.dense(hidden_states)
239
+ hidden_states = self.dropout(hidden_states)
240
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
241
+ return hidden_states
242
+
243
+
244
+ class BertSelfAttention(nn.Module):
245
+ def __init__(self, config):
246
+ super(BertSelfAttention, self).__init__()
247
+ if config.hidden_size % config.num_attention_heads != 0:
248
+ raise ValueError(
249
+ "The hidden size (%d) is not a multiple of the number of attention "
250
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads))
251
+ self.num_attention_heads = config.num_attention_heads
252
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
253
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
254
+
255
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
256
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
257
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
258
+
259
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
260
+
261
+ def transpose_for_scores(self, x):
262
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh)
263
+ x = x.view(*new_x_shape)
264
+ return x.permute(0, 2, 1, 3) # (N, nh, L, dh)
265
+
266
+ def forward(self, query_states, key_states, value_states, attention_mask):
267
+ """
268
+ Args:
269
+ query_states: (N, Lq, D)
270
+ key_states: (N, L, D)
271
+ value_states: (N, L, D)
272
+ attention_mask: (N, Lq, L)
273
+ Returns:
274
+ """
275
+ # only need to mask the dimension where the softmax (last dim) is applied, as the other dim (second to last)
276
+ # will be ignored in future computation anyway
277
+ attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L)
278
+ mixed_query_layer = self.query(query_states)
279
+ mixed_key_layer = self.key(key_states)
280
+ mixed_value_layer = self.value(value_states)
281
+
282
+ query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh)
283
+ key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh)
284
+ value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh)
285
+
286
+ # Take the dot product between "query" and "key" to get the raw attention scores.
287
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L)
288
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
289
+ # Apply the attention mask (precomputed for all layers in BertModel forward() function)
290
+ attention_scores = attention_scores + attention_mask
291
+
292
+ # Normalize the attention scores to probabilities.
293
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
294
+
295
+ # This is actually dropping out entire tokens to attend to, which might
296
+ # seem a bit unusual, but is taken from the original Transformer paper.
297
+ attention_probs = self.dropout(attention_probs)
298
+
299
+ context_layer = torch.matmul(attention_probs, value_layer)
300
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
301
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
302
+ context_layer = context_layer.view(*new_context_layer_shape)
303
+ return context_layer
304
+
305
+
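The additive masking trick used in BertSelfAttention.forward can be seen in isolation below; the scores and mask are made up for illustration:

scores = torch.tensor([[2.0, 1.0, 0.5]])   # raw attention scores of one query over 3 keys
mask = torch.tensor([[1.0, 1.0, 0.0]])     # the last key is padding
masked = scores + (1 - mask) * -10000.     # padded positions become hugely negative
probs = F.softmax(masked, dim=-1)
assert probs[0, 2] < 1e-6                  # padded keys receive (effectively) zero attention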
306
+ class BertSelfOutput(nn.Module):
307
+ def __init__(self, config):
308
+ super(BertSelfOutput, self).__init__()
309
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
310
+ self.LayerNorm = nn.LayerNorm(config.hidden_size)
311
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
312
+
313
+ def forward(self, hidden_states, input_tensor):
314
+ hidden_states = self.dense(hidden_states)
315
+ hidden_states = self.dropout(hidden_states)
316
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
317
+ return hidden_states
baselines/crossmodal_moment_localization/model_xml.py ADDED
@@ -0,0 +1,642 @@
1
+ import math
2
+ import copy
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from easydict import EasyDict as edict
7
+ from baselines.crossmodal_moment_localization.model_components import \
8
+ BertAttention, PositionEncoding, LinearLayer, BertSelfAttention, TrainablePositionalEncoding, ConvEncoder
9
+ from utils.model_utils import RNNEncoder
10
+
11
+ base_bert_layer_config = dict(
12
+ hidden_size=768,
13
+ intermediate_size=768,
14
+ hidden_dropout_prob=0.1,
15
+ attention_probs_dropout_prob=0.1,
16
+ num_attention_heads=4,
17
+ )
18
+
19
+ xml_base_config = edict(
20
+ merge_two_stream=True, # merge only the scores
21
+ cross_att=True, # cross-attention for video and subtitles
22
+ span_predictor_type="conv",
23
+ encoder_type="transformer", # cnn, transformer, lstm, gru
24
+ add_pe_rnn=False, # add positional encoding for RNNs, (LSTM and GRU)
25
+ visual_input_size=2048, # changes based on visual input type
26
+ query_input_size=768,
27
+ sub_input_size=768,
28
+ hidden_size=500, #
29
+ conv_kernel_size=5, # conv kernel_size for st_ed predictor
30
+ stack_conv_predictor_conv_kernel_sizes=-1, # Do not use
31
+ conv_stride=1, #
32
+ max_ctx_l=100,
33
+ max_desc_l=30,
34
+ input_drop=0.1, # dropout for input
35
+ drop=0.1, # dropout for other layers
36
+ n_heads=4, # self attention heads
37
+ ctx_mode="video_sub", # which context are used. 'video', 'sub' or 'video_sub'
38
+ margin=0.1, # margin for ranking loss
39
+ ranking_loss_type="hinge", # loss type, 'hinge' or 'lse'
40
+ lw_neg_q=1, # loss weight for neg. query and pos. context
41
+ lw_neg_ctx=1, # loss weight for pos. query and neg. context
42
+ lw_st_ed=1, # loss weight for st ed prediction
43
+ use_hard_negative=False, # use hard negative at video level, we may change it during training.
44
+ hard_pool_size=20,
45
+ use_self_attention=True,
46
+ no_modular=False,
47
+ pe_type="none", # no positional encoding
48
+ initializer_range=0.02,
49
+ )
50
+
51
+
52
+ class XML(nn.Module):
53
+ def __init__(self, config):
54
+ super(XML, self).__init__()
55
+ self.config = config
56
+ # self.position_embeddings = PositionEncoding(n_filters=config.hidden_size,
57
+ # max_len=config.max_position_embeddings,
58
+ # pe_type=config.pe_type)
59
+ self.query_pos_embed = TrainablePositionalEncoding(
60
+ max_position_embeddings=config.max_desc_l,
61
+ hidden_size=config.hidden_size, dropout=config.input_drop)
62
+ self.ctx_pos_embed = TrainablePositionalEncoding(
63
+ max_position_embeddings=config.max_ctx_l,
64
+ hidden_size=config.hidden_size, dropout=config.input_drop)
65
+ self.query_input_proj = LinearLayer(config.query_input_size,
66
+ config.hidden_size,
67
+ layer_norm=True,
68
+ dropout=config.input_drop,
69
+ relu=True)
70
+ if config.encoder_type == "transformer": # self-att encoder
71
+ self.query_encoder = BertAttention(edict(
72
+ hidden_size=config.hidden_size,
73
+ intermediate_size=config.hidden_size,
74
+ hidden_dropout_prob=config.drop,
75
+ attention_probs_dropout_prob=config.drop,
76
+ num_attention_heads=config.n_heads,
77
+ ))
78
+ elif config.encoder_type == "cnn":
79
+ self.query_encoder = ConvEncoder(
80
+ kernel_size=5,
81
+ n_filters=config.hidden_size,
82
+ dropout=config.drop
83
+ )
84
+ elif config.encoder_type in ["gru", "lstm"]:
85
+ self.query_encoder = RNNEncoder(
86
+ word_embedding_size=config.hidden_size,
87
+ hidden_size=config.hidden_size // 2,
88
+ bidirectional=True,
89
+ n_layers=1,
90
+ rnn_type=config.encoder_type,
91
+ return_outputs=True,
92
+ return_hidden=False
93
+ )
94
+
95
+ conv_cfg = dict(in_channels=1,
96
+ out_channels=1,
97
+ kernel_size=config.conv_kernel_size,
98
+ stride=config.conv_stride,
99
+ padding=config.conv_kernel_size // 2,
100
+ bias=False)
101
+
102
+ cross_att_cfg = edict(
103
+ hidden_size=config.hidden_size,
104
+ num_attention_heads=config.n_heads,
105
+ attention_probs_dropout_prob=config.drop
106
+ )
107
+
108
+ self.use_video = "video" in config.ctx_mode
109
+ if self.use_video:
110
+ self.video_input_proj = LinearLayer(config.visual_input_size,
111
+ config.hidden_size,
112
+ layer_norm=True,
113
+ dropout=config.input_drop,
114
+ relu=True)
115
+ self.video_encoder1 = copy.deepcopy(self.query_encoder)
116
+ self.video_encoder2 = copy.deepcopy(self.query_encoder)
117
+ if self.config.cross_att:
118
+ self.video_cross_att = BertSelfAttention(cross_att_cfg)
119
+ self.video_cross_layernorm = nn.LayerNorm(config.hidden_size)
120
+ else:
121
+ if self.config.encoder_type == "transformer":
122
+ self.video_encoder3 = copy.deepcopy(self.query_encoder)
123
+ self.video_query_linear = nn.Linear(config.hidden_size, config.hidden_size)
124
+ if config.span_predictor_type == "conv":
125
+ if not config.merge_two_stream:
126
+ self.video_st_predictor = nn.Conv1d(**conv_cfg)
127
+ self.video_ed_predictor = nn.Conv1d(**conv_cfg)
128
+ elif config.span_predictor_type == "cat_linear":
129
+ self.video_st_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)])
130
+ self.video_ed_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)])
131
+
132
+ self.use_sub = "sub" in config.ctx_mode
133
+ if self.use_sub:
134
+ self.sub_input_proj = LinearLayer(config.sub_input_size,
135
+ config.hidden_size,
136
+ layer_norm=True,
137
+ dropout=config.input_drop,
138
+ relu=True)
139
+ self.sub_encoder1 = copy.deepcopy(self.query_encoder)
140
+ self.sub_encoder2 = copy.deepcopy(self.query_encoder)
141
+ if self.config.cross_att:
142
+ self.sub_cross_att = BertSelfAttention(cross_att_cfg)
143
+ self.sub_cross_layernorm = nn.LayerNorm(config.hidden_size)
144
+ else:
145
+ if self.config.encoder_type == "transformer":
146
+ self.sub_encoder3 = copy.deepcopy(self.query_encoder)
147
+ self.sub_query_linear = nn.Linear(config.hidden_size, config.hidden_size)
148
+ if config.span_predictor_type == "conv":
149
+ if not config.merge_two_stream:
150
+ self.sub_st_predictor = nn.Conv1d(**conv_cfg)
151
+ self.sub_ed_predictor = nn.Conv1d(**conv_cfg)
152
+ elif config.span_predictor_type == "cat_linear":
153
+ self.sub_st_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)])
154
+ self.sub_ed_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)])
155
+
156
+ self.modular_vector_mapping = nn.Linear(in_features=config.hidden_size,
157
+ out_features=self.use_sub + self.use_video,
158
+ bias=False)
159
+
160
+ self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean")
161
+
162
+ if config.merge_two_stream and config.span_predictor_type == "conv":
163
+ if self.config.stack_conv_predictor_conv_kernel_sizes == -1:
164
+ self.merged_st_predictor = nn.Conv1d(**conv_cfg)
165
+ self.merged_ed_predictor = nn.Conv1d(**conv_cfg)
166
+ else:
167
+ print("Will be using multiple Conv layers for prediction.")
168
+ self.merged_st_predictors = nn.ModuleList()
169
+ self.merged_ed_predictors = nn.ModuleList()
170
+ num_convs = len(self.config.stack_conv_predictor_conv_kernel_sizes)
171
+ for k in self.config.stack_conv_predictor_conv_kernel_sizes:
172
+ conv_cfg = dict(in_channels=1,
173
+ out_channels=1,
174
+ kernel_size=k,
175
+ stride=config.conv_stride,
176
+ padding=k // 2,
177
+ bias=False)
178
+ self.merged_st_predictors.append(nn.Conv1d(**conv_cfg))
179
+ self.merged_ed_predictors.append(nn.Conv1d(**conv_cfg))
180
+ self.combine_st_conv = nn.Linear(num_convs, 1, bias=False)
181
+ self.combine_ed_conv = nn.Linear(num_convs, 1, bias=False)
182
+
183
+ self.reset_parameters()
184
+
185
+ def reset_parameters(self):
186
+ """ Initialize the weights."""
187
+
188
+ def re_init(module):
189
+ if isinstance(module, (nn.Linear, nn.Embedding)):
190
+ # Slightly different from the TF version which uses truncated_normal for initialization
191
+ # cf https://github.com/pytorch/pytorch/pull/5617
192
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
193
+ elif isinstance(module, nn.LayerNorm):
194
+ module.bias.data.zero_()
195
+ module.weight.data.fill_(1.0)
196
+ elif isinstance(module, nn.Conv1d):
197
+ module.reset_parameters()
198
+ if isinstance(module, nn.Linear) and module.bias is not None:
199
+ module.bias.data.zero_()
200
+
201
+ self.apply(re_init)
202
+
203
+ def set_hard_negative(self, use_hard_negative, hard_pool_size):
204
+ """use_hard_negative: bool; hard_pool_size: int, """
205
+ self.config.use_hard_negative = use_hard_negative
206
+ self.config.hard_pool_size = hard_pool_size
207
+
208
+ def set_train_st_ed(self, lw_st_ed):
209
+ """pre-train video retrieval then span prediction"""
210
+ self.config.lw_st_ed = lw_st_ed
211
+
212
+ def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask,
213
+ tef_feat, tef_mask, st_ed_indices):
214
+ """
215
+ Args:
216
+ query_feat: (N, Lq, Dq)
217
+ query_mask: (N, Lq)
218
+ video_feat: (N, Lv, Dv) or None
219
+ video_mask: (N, Lv) or None
220
+ sub_feat: (N, Lv, Ds) or None
221
+ sub_mask: (N, Lv) or None
222
+ tef_feat: (N, Lv, 2) or None,
223
+ tef_mask: (N, Lv) or None,
224
+ st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively.
225
+ """
226
+ video_feat1, video_feat2, sub_feat1, sub_feat2 = \
227
+ self.encode_context(video_feat, video_mask, sub_feat, sub_mask)
228
+
229
+ query_context_scores, st_prob, ed_prob = \
230
+ self.get_pred_from_raw_query(query_feat, query_mask,
231
+ video_feat1, video_feat2, video_mask,
232
+ sub_feat1, sub_feat2, sub_mask, cross=False)
233
+
234
+ loss_st_ed = 0
235
+ if self.config.lw_st_ed != 0:
236
+ loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0])
237
+ loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1])
238
+ loss_st_ed = loss_st + loss_ed
239
+
240
+ loss_neg_ctx, loss_neg_q = 0, 0
241
+ if self.config.lw_neg_ctx != 0 or self.config.lw_neg_q != 0:
242
+ loss_neg_ctx, loss_neg_q = self.get_video_level_loss(query_context_scores)
243
+
244
+ loss_st_ed = self.config.lw_st_ed * loss_st_ed
245
+ loss_neg_ctx = self.config.lw_neg_ctx * loss_neg_ctx
246
+ loss_neg_q = self.config.lw_neg_q * loss_neg_q
247
+ loss = loss_st_ed + loss_neg_ctx + loss_neg_q
248
+ return loss, {"loss_st_ed": float(loss_st_ed),
249
+ "loss_neg_ctx": float(loss_neg_ctx),
250
+ "loss_neg_q": float(loss_neg_q),
251
+ "loss_overall": float(loss)}
252
+
253
+ def get_visualization_data(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask,
254
+ tef_feat, tef_mask, st_ed_indices):
255
+ assert self.config.merge_two_stream and self.use_video and self.use_sub and not self.config.no_modular
256
+ video_feat1, video_feat2, sub_feat1, sub_feat2 = \
257
+ self.encode_context(video_feat, video_mask, sub_feat, sub_mask)
258
+ encoded_query = self.encode_input(query_feat, query_mask,
259
+ self.query_input_proj, self.query_encoder, self.query_pos_embed) # (N, Lq, D)
260
+ # (N, D), (N, D), (N, L, 2)
261
+ video_query, sub_query, modular_att_scores = \
262
+ self.get_modularized_queries(encoded_query, query_mask, return_modular_att=True)
263
+ # (N, L), (N, L), (N, L)
264
+ st_prob, ed_prob, similarity_scores, video_similarity, sub_similarity = self.get_merged_st_ed_prob(
265
+ video_query, video_feat2, sub_query, sub_feat2, video_mask, cross=False, return_similarity=True)
266
+
267
+ # clean up invalid bits
268
+ data = dict(modular_att_scores=modular_att_scores.cpu().numpy(), # (N, Lq, 2), row 0, 1 are video, sub.
269
+ st_prob=st_prob.cpu().numpy(), # (N, L)
270
+ ed_prob=ed_prob.cpu().numpy(), # (N, L)
271
+ similarity_scores=similarity_scores.cpu().numpy(), # (N, L)
272
+ video_similarity=video_similarity.cpu().numpy(), # (N, L)
273
+ sub_similarity=sub_similarity.cpu().numpy(), # (N, L)
274
+ st_ed_indices=st_ed_indices.cpu().numpy()) # (N, 2)
275
+ query_lengths = query_mask.sum(1).to(torch.long).cpu().tolist() # (N, )
276
+ ctx_lengths = video_mask.sum(1).to(torch.long).cpu().tolist() # (N, )
277
+ # print("query_lengths {}".format((type(query_lengths), len(query_lengths), query_lengths[:10])))
278
+ for k, v in data.items():
279
+ if k == "modular_att_scores":
280
+ # print(k, v, v.shape, type(v))
281
+ data[k] = [e[:l] for l, e in zip(query_lengths, v)] # list(e) where e is (Lq_i, 2)
282
+ else:
283
+ data[k] = [e[:l] for l, e in zip(ctx_lengths, v)] # list(e) where e is (Lc_i)
284
+
285
+ # aggregate info for each example
286
+ datalist = []
287
+ for idx in range(len(data["modular_att_scores"])):
288
+ datalist.append({k: v[idx] for k, v in data.items()})
289
+ return datalist # list(dicts) of length N
290
+
291
+ def encode_query(self, query_feat, query_mask):
292
+ encoded_query = self.encode_input(query_feat, query_mask,
293
+ self.query_input_proj, self.query_encoder, self.query_pos_embed) # (N, Lq, D)
294
+ video_query, sub_query = self.get_modularized_queries(encoded_query, query_mask) # (N, D) * 2
295
+ return video_query, sub_query
296
+
297
+ def non_cross_encode_context(self, context_feat, context_mask, module_name="video"):
298
+ encoder_layer3 = getattr(self, module_name + "_encoder3") \
299
+ if self.config.encoder_type == "transformer" else None
300
+ return self._non_cross_encode_context(context_feat, context_mask,
301
+ input_proj_layer=getattr(self, module_name + "_input_proj"),
302
+ encoder_layer1=getattr(self, module_name + "_encoder1"),
303
+ encoder_layer2=getattr(self, module_name + "_encoder2"),
304
+ encoder_layer3=encoder_layer3)
305
+
306
+ def _non_cross_encode_context(self, context_feat, context_mask, input_proj_layer,
307
+ encoder_layer1, encoder_layer2, encoder_layer3=None):
308
+ """
309
+ Args:
310
+ context_feat: (N, L, D)
311
+ context_mask: (N, L)
312
+ input_proj_layer:
313
+ encoder_layer1:
314
+ encoder_layer2:
315
+ encoder_layer3
316
+ """
317
+ context_feat1 = self.encode_input(
318
+ context_feat, context_mask, input_proj_layer, encoder_layer1, self.ctx_pos_embed) # (N, L, D)
319
+ if self.config.encoder_type in ["transformer", "cnn"]:
320
+ context_mask = context_mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor
321
+ context_feat2 = encoder_layer2(context_feat1, context_mask) # (N, L, D)
322
+ if self.config.encoder_type == "transformer":
323
+ context_feat2 = encoder_layer3(context_feat2, context_mask)
324
+ elif self.config.encoder_type in ["gru", "lstm"]:
325
+ context_mask = context_mask.sum(1).long() # (N, ), torch.LongTensor
326
+ context_feat2 = encoder_layer2(context_feat1, context_mask)[0] # (N, L, D)
327
+ else:
328
+ raise NotImplementedError
329
+ return context_feat1, context_feat2
330
+
331
+ def encode_context(self, video_feat, video_mask, sub_feat, sub_mask):
332
+ if self.config.cross_att:
333
+ assert self.use_video and self.use_sub
334
+
335
+ return self.cross_encode_context(video_feat, video_mask, sub_feat, sub_mask)
336
+ else:
337
+ video_feat1, video_feat2 = (None,) * 2
338
+ if self.use_video:
339
+ video_feat1, video_feat2 = self.non_cross_encode_context(video_feat, video_mask, module_name="video")
340
+ sub_feat1, sub_feat2 = (None,) * 2
341
+ if self.use_sub:
342
+ sub_feat1, sub_feat2 = self.non_cross_encode_context(sub_feat, sub_mask, module_name="sub")
343
+ return video_feat1, video_feat2, sub_feat1, sub_feat2
344
+
345
+ def cross_encode_context(self, video_feat, video_mask, sub_feat, sub_mask):
346
+ encoded_video_feat = self.encode_input(video_feat, video_mask,
347
+ self.video_input_proj, self.video_encoder1, self.ctx_pos_embed)
348
+ encoded_sub_feat = self.encode_input(sub_feat, sub_mask,
349
+ self.sub_input_proj, self.sub_encoder1, self.ctx_pos_embed)
350
+ x_encoded_video_feat = self.cross_context_encoder(
351
+ encoded_video_feat, video_mask, encoded_sub_feat, sub_mask,
352
+ self.video_cross_att, self.video_cross_layernorm, self.video_encoder2) # (N, L, D)
353
+ x_encoded_sub_feat = self.cross_context_encoder(
354
+ encoded_sub_feat, sub_mask, encoded_video_feat, video_mask,
355
+ self.sub_cross_att, self.sub_cross_layernorm, self.sub_encoder2) # (N, L, D)
356
+ return encoded_video_feat, x_encoded_video_feat, encoded_sub_feat, x_encoded_sub_feat
357
+
358
+ def cross_context_encoder(self, main_context_feat, main_context_mask, side_context_feat, side_context_mask,
359
+ cross_att_layer, norm_layer, self_att_layer):
360
+ """
361
+ Args:
362
+ main_context_feat: (N, Lq, D)
363
+ main_context_mask: (N, Lq)
364
+ side_context_feat: (N, Lk, D)
365
+ side_context_mask: (N, Lk)
366
+ cross_att_layer:
367
+ norm_layer:
368
+ self_att_layer:
369
+ """
370
+ cross_mask = torch.einsum("bm,bn->bmn", main_context_mask, side_context_mask) # (N, Lq, Lk)
371
+ cross_out = cross_att_layer(main_context_feat, side_context_feat, side_context_feat, cross_mask) # (N, Lq, D)
372
+ residual_out = norm_layer(cross_out + main_context_feat)
373
+ if self.config.encoder_type in ["cnn", "transformer"]:
374
+ return self_att_layer(residual_out, main_context_mask.unsqueeze(1))
375
+ elif self.config.encoder_type in ["gru", "lstm"]:
376
+ return self_att_layer(residual_out, main_context_mask.sum(1).long())[0]
377
+
378
+ def encode_input(self, feat, mask, input_proj_layer, encoder_layer, pos_embed_layer):
379
+ """
380
+ Args:
381
+ feat: (N, L, D_input), torch.float32
382
+ mask: (N, L), torch.float32, with 1 indicates valid query, 0 indicates mask
383
+ input_proj_layer: down project input
384
+ encoder_layer: encoder layer
385
+ # add_pe: bool, whether to add positional encoding
386
+ pos_embed_layer
387
+ """
388
+ feat = input_proj_layer(feat)
389
+
390
+ if self.config.encoder_type in ["cnn", "transformer"]:
391
+ feat = pos_embed_layer(feat)
392
+ mask = mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor
393
+ return encoder_layer(feat, mask) # (N, L, D_hidden)
394
+ elif self.config.encoder_type in ["gru", "lstm"]:
395
+ if self.config.add_pe_rnn:
396
+ feat = pos_embed_layer(feat)
397
+ mask = mask.sum(1).long() # (N, ), torch.LongTensor
398
+ return encoder_layer(feat, mask)[0] # (N, L, D_hidden)
399
+
400
+ def get_modularized_queries(self, encoded_query, query_mask, return_modular_att=False):
401
+ """
402
+ Args:
403
+ encoded_query: (N, L, D)
404
+ query_mask: (N, L)
405
+ return_modular_att: bool
406
+ """
407
+ if self.config.no_modular:
408
+ modular_query = torch.max(mask_logits(encoded_query, query_mask.unsqueeze(2)), dim=1)[0] # (N, D)
409
+ return modular_query, modular_query #
410
+ else:
411
+ modular_attention_scores = self.modular_vector_mapping(encoded_query) # (N, L, 2 or 1)
412
+ modular_attention_scores = F.softmax(
413
+ mask_logits(modular_attention_scores, query_mask.unsqueeze(2)), dim=1)
414
+ # TODO check whether it is the same
415
+ modular_queries = torch.einsum("blm,bld->bmd",
416
+ modular_attention_scores, encoded_query) # (N, 2 or 1, D)
417
+ if return_modular_att:
418
+ assert modular_queries.shape[1] == 2
419
+ return modular_queries[:, 0], modular_queries[:, 1], modular_attention_scores
420
+ else:
421
+ if modular_queries.shape[1] == 2:
422
+ return modular_queries[:, 0], modular_queries[:, 1] # (N, D) * 2
423
+ else: # 1
424
+ return modular_queries[:, 0], modular_queries[:, 0] # the same
425
+
426
+ def get_modular_weights(self, encoded_query, query_mask):
427
+ """
428
+ Args:
429
+ encoded_query: (N, L, D)
430
+ query_mask: (N, L)
431
+ """
432
+ max_encoded_query, _ = torch.max(mask_logits(encoded_query, query_mask.unsqueeze(2)), dim=1) # (N, D)
433
+ modular_weights = self.modular_weights_calculator(max_encoded_query) # (N, 2)
434
+ modular_weights = F.softmax(modular_weights, dim=-1)
435
+ return modular_weights[:, 0:1], modular_weights[:, 1:2] # (N, 1) * 2
436
+
437
+ def get_video_level_scores(self, modularied_query, context_feat1, context_mask):
438
+ """ Calculate video2query scores for each pair of video and query inside the batch.
439
+ Args:
440
+ modularied_query: (N, D)
441
+ context_feat1: (N, L, D), output of the first transformer encoder layer
442
+ context_mask: (N, L)
443
+ Returns:
444
+ context_query_scores: (N, N) score of each query w.r.t. each video inside the batch,
445
+ diagonal positions are positive; used to get negative samples.
446
+ """
447
+ modularied_query = F.normalize(modularied_query, dim=-1)
448
+ context_feat1 = F.normalize(context_feat1, dim=-1)
449
+ query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat1) # (N, L, N)
450
+ context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N)
451
+ query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N)
452
+ query_context_scores, _ = torch.max(query_context_scores,
453
+ dim=1) # (N, N) diagonal positions are positive pairs.
454
+ return query_context_scores
455
+
456
+ def get_merged_st_ed_prob(self, video_query, video_feat, sub_query, sub_feat, context_mask,
457
+ cross=False, return_similarity=False):
458
+ """context_mask could be either video_mask or sub_mask, since they are the same"""
459
+ assert self.use_video and self.use_sub and self.config.span_predictor_type == "conv"
460
+ video_query = self.video_query_linear(video_query)
461
+ sub_query = self.sub_query_linear(sub_query)
462
+ stack_conv = self.config.stack_conv_predictor_conv_kernel_sizes != -1
463
+ num_convs = len(self.config.stack_conv_predictor_conv_kernel_sizes) if stack_conv else None
464
+ if cross:
465
+ video_similarity = torch.einsum("md,nld->mnl", video_query, video_feat)
466
+ sub_similarity = torch.einsum("md,nld->mnl", sub_query, sub_feat)
467
+ similarity = (video_similarity + sub_similarity) / 2 # (Nq, Nv, L) from query to all videos.
468
+ n_q, n_c, l = similarity.shape
469
+ similarity = similarity.view(n_q * n_c, 1, l)
470
+ if not stack_conv:
471
+ st_prob = self.merged_st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L)
472
+ ed_prob = self.merged_ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L)
473
+ else:
474
+ st_prob_list = []
475
+ ed_prob_list = []
476
+ for idx in range(num_convs):
477
+ st_prob_list.append(self.merged_st_predictors[idx](similarity).squeeze().unsqueeze(2))
478
+ ed_prob_list.append(self.merged_ed_predictors[idx](similarity).squeeze().unsqueeze(2))
479
+ # (Nq*Nv, L, 3) --> (Nq*Nv, L) -> (Nq, Nv, L)
480
+ st_prob = self.combine_st_conv(torch.cat(st_prob_list, dim=2)).view(n_q, n_c, l)
481
+ ed_prob = self.combine_ed_conv(torch.cat(ed_prob_list, dim=2)).view(n_q, n_c, l)
482
+ else:
483
+ video_similarity = torch.einsum("bd,bld->bl", video_query, video_feat) # (N, L)
484
+ sub_similarity = torch.einsum("bd,bld->bl", sub_query, sub_feat) # (N, L)
485
+ similarity = (video_similarity + sub_similarity) / 2
486
+ if not stack_conv:
487
+ st_prob = self.merged_st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L)
488
+ ed_prob = self.merged_ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L)
489
+ else:
490
+ st_prob_list = []
491
+ ed_prob_list = []
492
+ for idx in range(num_convs):
493
+ st_prob_list.append(self.merged_st_predictors[idx](similarity.unsqueeze(1)).squeeze().unsqueeze(2))
494
+ ed_prob_list.append(self.merged_ed_predictors[idx](similarity.unsqueeze(1)).squeeze().unsqueeze(2))
495
+ st_prob = self.combine_st_conv(torch.cat(st_prob_list, dim=2)).squeeze() # (N, L, 3) --> (N, L)
496
+ ed_prob = self.combine_ed_conv(torch.cat(ed_prob_list, dim=2)).squeeze() # (N, L, 3) --> (N, L)
497
+ st_prob = mask_logits(st_prob, context_mask) # (N, L)
498
+ ed_prob = mask_logits(ed_prob, context_mask)
499
+ if return_similarity:
500
+ assert not cross
501
+ return st_prob, ed_prob, similarity, video_similarity, sub_similarity
502
+ else:
503
+ return st_prob, ed_prob
504
+
505
+ def get_st_ed_prob(self, modularied_query, context_feat2, context_mask,
506
+ module_name="video", cross=False):
507
+ return self._get_st_ed_prob(modularied_query, context_feat2, context_mask,
508
+ module_query_linear=getattr(self, module_name + "_query_linear"),
509
+ st_predictor=getattr(self, module_name + "_st_predictor"),
510
+ ed_predictor=getattr(self, module_name + "_ed_predictor"),
511
+ cross=cross)
512
+
513
+ def _get_st_ed_prob(self, modularied_query, context_feat2, context_mask,
514
+ module_query_linear, st_predictor, ed_predictor, cross=False):
515
+ """
516
+ Args:
517
+ modularied_query: (N, D)
518
+ context_feat2: (N, L, D), output of the first transformer encoder layer
519
+ context_mask: (N, L)
520
+ module_query_linear:
521
+ st_predictor:
522
+ ed_predictor:
523
+ cross: at inference, calculate prob for each possible pairs of query and context.
524
+ """
525
+ query = module_query_linear(modularied_query) # (N, D) no need to normalize here.
526
+ if cross:
527
+ if self.config.span_predictor_type == "conv":
528
+ similarity = torch.einsum("md,nld->mnl", query, context_feat2) # (Nq, Nv, L) from query to all videos.
529
+ n_q, n_c, l = similarity.shape
530
+ similarity = similarity.view(n_q * n_c, 1, l)
531
+ st_prob = st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L)
532
+ ed_prob = ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L)
533
+ elif self.config.span_predictor_type == "cat_linear":
534
+ st_prob_q = st_predictor[0](query).unsqueeze(1) # (Nq, 1, 1)
535
+ st_prob_ctx = st_predictor[1](context_feat2).squeeze().unsqueeze(0) # (1, Nv, L)
536
+ st_prob = st_prob_q + st_prob_ctx # (Nq, Nv, L)
537
+ ed_prob_q = ed_predictor[0](query).unsqueeze(1) # (Nq, 1, 1)
538
+ ed_prob_ctx = ed_predictor[1](context_feat2).squeeze().unsqueeze(0) # (1, Nv, L)
539
+ ed_prob = ed_prob_q + ed_prob_ctx # (Nq, Nv, L)
540
+ context_mask = context_mask.unsqueeze(0) # (1, Nv, L)
541
+ else:
542
+ if self.config.span_predictor_type == "conv":
543
+ similarity = torch.einsum("bd,bld->bl", query, context_feat2) # (N, L)
544
+ st_prob = st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L)
545
+ ed_prob = ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L)
546
+ elif self.config.span_predictor_type == "cat_linear":
547
+ # avoid concatenation by break into smaller matrix multiplications.
548
+ st_prob = st_predictor[0](query) + st_predictor[1](context_feat2).squeeze() # (N, L)
549
+ ed_prob = ed_predictor[0](query) + ed_predictor[1](context_feat2).squeeze() # (N, L)
550
+ st_prob = mask_logits(st_prob, context_mask) # (N, L)
551
+ ed_prob = mask_logits(ed_prob, context_mask)
552
+ return st_prob, ed_prob
553
+
554
+ def get_pred_from_raw_query(self, query_feat, query_mask,
555
+ video_feat1, video_feat2, video_mask,
556
+ sub_feat1, sub_feat2, sub_mask, cross=False):
557
+ """
558
+ Args:
559
+ query_feat: (N, Lq, Dq)
560
+ query_mask: (N, Lq)
561
+ video_feat1: (N, Lv, D) or None
562
+ video_feat2:
563
+ video_mask: (N, Lv)
564
+ sub_feat1: (N, Lv, D) or None
565
+ sub_feat2:
566
+ sub_mask: (N, Lv)
567
+ cross:
568
+ """
569
+ video_query, sub_query = self.encode_query(query_feat, query_mask)
570
+ divisor = self.use_sub + self.use_video
571
+
572
+ # get video-level retrieval scores
573
+ video_q2ctx_scores = self.get_video_level_scores(video_query, video_feat1, video_mask) if self.use_video else 0
574
+ sub_q2ctx_scores = self.get_video_level_scores(sub_query, sub_feat1, sub_mask) if self.use_sub else 0
575
+ q2ctx_scores = (video_q2ctx_scores + sub_q2ctx_scores) / divisor # (N, N)
576
+
577
+ if self.config.merge_two_stream and self.use_video and self.use_sub:
578
+ st_prob, ed_prob = self.get_merged_st_ed_prob(
579
+ video_query, video_feat2, sub_query, sub_feat2, video_mask, cross=cross)
580
+ else:
581
+ video_st_prob, video_ed_prob = self.get_st_ed_prob(
582
+ video_query, video_feat2, video_mask, module_name="video", cross=cross) if self.use_video else (0, 0)
583
+ sub_st_prob, sub_ed_prob = self.get_st_ed_prob(
584
+ sub_query, sub_feat2, sub_mask, module_name="sub", cross=cross) if self.use_sub else (0, 0)
585
+ st_prob = (video_st_prob + sub_st_prob) / divisor # (N, Lv)
586
+ ed_prob = (video_ed_prob + sub_ed_prob) / divisor # (N, Lv)
587
+ return q2ctx_scores, st_prob, ed_prob # un-normalized masked probabilities!!!!!
588
+
589
+ def get_video_level_loss(self, query_context_scores):
590
+ """ ranking loss between (pos. query + pos. video) and (pos. query + neg. video) or (neg. query + pos. video)
591
+ Args:
592
+ query_context_scores: (N, N), cosine similarity [-1, 1],
593
+ Each row contains the scores between the query to each of the videos inside the batch.
594
+ """
595
+ bsz = len(query_context_scores)
596
+ diagonal_indices = torch.arange(bsz).to(query_context_scores.device)
597
+ pos_scores = query_context_scores[diagonal_indices, diagonal_indices] # (N, )
598
+ query_context_scores_masked = copy.deepcopy(query_context_scores.data)
599
+ # impossibly large for cosine similarity, the copy is created as modifying the original will cause error
600
+ query_context_scores_masked[diagonal_indices, diagonal_indices] = 999
601
+ pos_query_neg_context_scores = self.get_neg_scores(query_context_scores,
602
+ query_context_scores_masked)
603
+ neg_query_pos_context_scores = self.get_neg_scores(query_context_scores.transpose(0, 1),
604
+ query_context_scores_masked.transpose(0, 1))
605
+ loss_neg_ctx = self.get_ranking_loss(pos_scores, pos_query_neg_context_scores)
606
+ loss_neg_q = self.get_ranking_loss(pos_scores, neg_query_pos_context_scores)
607
+ return loss_neg_ctx, loss_neg_q
608
+
609
+ def get_neg_scores(self, scores, scores_masked):
610
+ """
611
+ scores: (N, N), cosine similarity [-1, 1],
612
+ Each row are scores: query --> all videos. Transposed version: video --> all queries.
613
+ scores_masked: (N, N) the same as scores, except that the diagonal (positive) positions
614
+ are masked with a large value.
615
+ """
616
+ bsz = len(scores)
617
+ batch_indices = torch.arange(bsz).to(scores.device)
618
+ _, sorted_scores_indices = torch.sort(scores_masked, descending=True, dim=1)
619
+ sample_min_idx = 1 # skip the masked positive
620
+ sample_max_idx = min(sample_min_idx + self.config.hard_pool_size, bsz) \
621
+ if self.config.use_hard_negative else bsz
622
+ sampled_neg_score_indices = sorted_scores_indices[
623
+ batch_indices, torch.randint(sample_min_idx, sample_max_idx, size=(bsz,)).to(scores.device)] # (N, )
624
+ sampled_neg_scores = scores[batch_indices, sampled_neg_score_indices] # (N, )
625
+ return sampled_neg_scores
626
+
627
+ def get_ranking_loss(self, pos_score, neg_score):
628
+ """ Note here we encourage positive scores to be larger than negative scores.
629
+ Args:
630
+ pos_score: (N, ), torch.float32
631
+ neg_score: (N, ), torch.float32
632
+ """
633
+ if self.config.ranking_loss_type == "hinge": # max(0, m + S_neg - S_pos)
634
+ return torch.clamp(self.config.margin + neg_score - pos_score, min=0).sum() / len(pos_score)
635
+ elif self.config.ranking_loss_type == "lse": # log[1 + exp(S_neg - S_pos)]
636
+ return torch.log1p(torch.exp(neg_score - pos_score)).sum() / len(pos_score)
637
+ else:
638
+ raise NotImplementedError("Only support 'hinge' and 'lse'")
639
+
640
+
641
+ def mask_logits(target, mask):
642
+ return target * mask + (1 - mask) * (-1e10)
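To see how the pieces above fit together, here is a minimal, hypothetical usage sketch: xml_base_config is copied, the model is built, and a single forward pass is run on random tensors whose shapes follow the forward() docstring (batch size, lengths, and features below are all made up):

cfg = copy.deepcopy(xml_base_config)   # copy so config tweaks do not leak into the module-level dict
model = XML(cfg)
N, Lq, Lv = 4, 20, 50                  # batch size, query length, context length (<= max_desc_l / max_ctx_l)
st = torch.randint(0, Lv // 2, (N,))   # random but valid start indices
ed = torch.randint(Lv // 2, Lv, (N,))  # random end indices, always after the starts
loss, loss_dict = model(
    query_feat=torch.randn(N, Lq, cfg.query_input_size), query_mask=torch.ones(N, Lq),
    video_feat=torch.randn(N, Lv, cfg.visual_input_size), video_mask=torch.ones(N, Lv),
    sub_feat=torch.randn(N, Lv, cfg.sub_input_size), sub_mask=torch.ones(N, Lv),
    tef_feat=None, tef_mask=None,
    st_ed_indices=torch.stack([st, ed], dim=1))
# loss_dict reports "loss_st_ed", "loss_neg_ctx", "loss_neg_q" and "loss_overall"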
baselines/crossmodal_moment_localization/ndcg_iou_topk.py ADDED
@@ -0,0 +1,68 @@
1
+ from utils.basic_utils import load_jsonl, save_jsonl, load_json
2
+ import pandas as pd
3
+ from tqdm import tqdm
4
+ import numpy as np
5
+ from collections import defaultdict
6
+ import copy
7
+
8
+ def calculate_iou(pred_start: float, pred_end: float, gt_start: float, gt_end: float) -> float:
9
+ intersection_start = max(pred_start, gt_start)
10
+ intersection_end = min(pred_end, gt_end)
11
+ intersection = max(0, intersection_end - intersection_start)
12
+ union = (pred_end - pred_start) + (gt_end - gt_start) - intersection
13
+ return intersection / union if union > 0 else 0
14
+
15
+
16
+ # Function to calculate DCG
17
+ def calculate_dcg(scores):
18
+ return sum((2**score - 1) / np.log2(idx + 2) for idx, score in enumerate(scores))
19
+
20
+ # Function to calculate NDCG
21
+ def calculate_ndcg(pred_scores, true_scores):
22
+ dcg = calculate_dcg(pred_scores)
23
+ idcg = calculate_dcg(sorted(true_scores, reverse=True))
24
+ return dcg / idcg if idcg > 0 else 0
25
+
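A tiny self-check of the helpers above (all numbers made up): the IoU of [0, 10] vs [5, 15] is 5/15, a perfectly ordered ranking gives NDCG = 1, and swapping the top two relevances lowers it:

def _demo_metric_helpers():  # illustrative only; not used by the evaluation pipeline
    assert abs(calculate_iou(0, 10, 5, 15) - 1 / 3) < 1e-9
    assert abs(calculate_ndcg([3, 2, 1], [3, 2, 1]) - 1.0) < 1e-9
    assert calculate_ndcg([2, 3, 1], [3, 2, 1]) < 1.0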
26
+
27
+
28
+ def calculate_ndcg_iou(all_gt, all_pred, TS, KS):
29
+ performance = defaultdict(lambda: defaultdict(list))
30
+ performance_avg = defaultdict(lambda: defaultdict(float))
31
+ for k in tqdm(all_pred.keys(), desc="Calculate NDCG"):
32
+ one_pred = all_pred[k]
33
+ one_gt = all_gt[k]
34
+
35
+ one_gt.sort(key=lambda x: x["relevance"], reverse=True)
36
+ for T in TS:
37
+ one_gt_drop = copy.deepcopy(one_gt)
38
+ predictions_with_scores = []
39
+
40
+ for pred in one_pred:
41
+ pred_video_name, pred_time = pred["video_name"], pred["timestamp"]
42
+ matched_rows = [gt for gt in one_gt_drop if gt["video_name"] == pred_video_name]
43
+ if not matched_rows:
44
+ pred["pred_relevance"] = 0
45
+ else:
46
+ ious = [calculate_iou(pred_time[0], pred_time[1], gt["timestamp"][0], gt["timestamp"][1]) for gt in matched_rows]
47
+ max_iou_idx = np.argmax(ious)
48
+ max_iou_row = matched_rows[max_iou_idx]
49
+
50
+ if ious[max_iou_idx] > T:
51
+ pred["pred_relevance"] = max_iou_row["relevance"]
52
+ # Remove the matched ground truth row
53
+ original_idx = one_gt_drop.index(max_iou_row)
54
+ one_gt_drop.pop(original_idx)
55
+ else:
56
+ pred["pred_relevance"] = 0
57
+ predictions_with_scores.append(pred)
58
+ for K in KS:
59
+ true_scores = [gt["relevance"] for gt in one_gt][:K]
60
+ pred_scores = [pred["pred_relevance"] for pred in predictions_with_scores][:K]
61
+ ndcg_score = calculate_ndcg(pred_scores, true_scores)
62
+ performance[K][T].append(ndcg_score)
63
+ for K, vs in performance.items():
64
+ for T, v in vs.items():
65
+ performance_avg[K][T] = np.mean(v)
66
+ return performance_avg
67
+
68
+
baselines/crossmodal_moment_localization/optimization.py ADDED
@@ -0,0 +1,338 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch optimization for BERT model."""
16
+
17
+ import math
18
+ import torch
19
+ from torch.optim import Optimizer
20
+ from torch.optim.optimizer import required
21
+ from torch.nn.utils import clip_grad_norm_
22
+ import logging
23
+ import abc
24
+ import sys
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ if sys.version_info >= (3, 4):
30
+ ABC = abc.ABC
31
+ else:
32
+ ABC = abc.ABCMeta('ABC', (), {})
33
+
34
+
35
+ class _LRSchedule(ABC):
36
+ """ Parent of all LRSchedules here. """
37
+ warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense
38
+ def __init__(self, warmup=0.002, t_total=-1, **kw):
39
+ """
40
+ :param warmup: what fraction of t_total steps will be used for linear warmup
41
+ :param t_total: how many training steps (updates) are planned
42
+ :param kw:
43
+ """
44
+ super(_LRSchedule, self).__init__(**kw)
45
+ if t_total < 0:
46
+ logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
47
+ if not 0.0 <= warmup < 1.0 and not warmup == -1:
48
+ raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
49
+ warmup = max(warmup, 0.)
50
+ self.warmup, self.t_total = float(warmup), float(t_total)
51
+ self.warned_for_t_total_at_progress = -1
52
+
53
+ def get_lr(self, step, nowarn=False):
54
+ """
55
+ :param step: which of t_total steps we're on
56
+ :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps
57
+ :return: learning rate multiplier for current update
58
+ """
59
+ if self.t_total < 0:
60
+ return 1.
61
+ progress = float(step) / self.t_total
62
+ ret = self.get_lr_(progress)
63
+ # warning for exceeding t_total (only active with warmup_linear)
64
+ if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
65
+ logger.warning(
66
+ "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
67
+ .format(ret, self.__class__.__name__))
68
+ self.warned_for_t_total_at_progress = progress
69
+ # end warning
70
+ return ret
71
+
72
+ @abc.abstractmethod
73
+ def get_lr_(self, progress):
74
+ """
75
+ :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress
76
+ :return: learning rate multiplier for current update
77
+ """
78
+ return 1.
79
+
80
+
81
+ class ConstantLR(_LRSchedule):
82
+ def get_lr_(self, progress):
83
+ return 1.
84
+
85
+
86
+ class WarmupCosineSchedule(_LRSchedule):
87
+ """
88
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
89
+ Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
90
+ If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
91
+ """
92
+ warn_t_total = True
93
+ def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
94
+ """
95
+ :param warmup: see LRSchedule
96
+ :param t_total: see LRSchedule
97
+ :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
98
+ :param kw:
99
+ """
100
+ super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
101
+ self.cycles = cycles
102
+
103
+ def get_lr_(self, progress):
104
+ if progress < self.warmup:
105
+ return progress / self.warmup
106
+ else:
107
+ progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
108
+ return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
109
+
110
+
111
+ class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
112
+ """
113
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
114
+ If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
115
+ learning rate (with hard restarts).
116
+ """
117
+ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
118
+ super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
119
+ assert(cycles >= 1.)
120
+
121
+ def get_lr_(self, progress):
122
+ if progress < self.warmup:
123
+ return progress / self.warmup
124
+ else:
125
+ progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
126
+ ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
127
+ return ret
128
+
129
+
130
+ class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
131
+ """
132
+ All training progress is divided in `cycles` (default=1.) parts of equal length.
133
+ Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
134
+ followed by a learning rate decreasing from 1. to 0. following a cosine curve.
135
+ """
136
+ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
137
+ assert(warmup * cycles < 1.)
138
+ warmup = warmup * cycles if warmup >= 0 else warmup
139
+ super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
140
+
141
+ def get_lr_(self, progress):
142
+ progress = progress * self.cycles % 1.
143
+ if progress < self.warmup:
144
+ return progress / self.warmup
145
+ else:
146
+ progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup
147
+ ret = 0.5 * (1. + math.cos(math.pi * progress))
148
+ return ret
149
+
150
+
151
+ class WarmupConstantSchedule(_LRSchedule):
152
+ """
153
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
154
+ Keeps learning rate equal to 1. after warmup.
155
+ """
156
+ def get_lr_(self, progress):
157
+ if progress < self.warmup:
158
+ return progress / self.warmup
159
+ return 1.
160
+
161
+
162
+ class WarmupLinearSchedule(_LRSchedule):
163
+ """
164
+ Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
165
+ Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
166
+ """
167
+ warn_t_total = True
168
+ def get_lr_(self, progress):
169
+ if progress < self.warmup:
170
+ return progress / self.warmup
171
+ return max((progress - 1.) / (self.warmup - 1.), 0.)
172
+
173
+
174
+ SCHEDULES = {
175
+ None: ConstantLR,
176
+ "none": ConstantLR,
177
+ "warmup_cosine": WarmupCosineSchedule,
178
+ "warmup_constant": WarmupConstantSchedule,
179
+ "warmup_linear": WarmupLinearSchedule
180
+ }
181
+
182
+
183
+ class EMA(object):
184
+ """ Exponential Moving Average for model parameters.
185
+ references:
186
+ [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py
187
+ [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py"""
188
+ def __init__(self, decay):
189
+ self.decay = decay
190
+ self.shadow = {}
191
+ self.original = {}
192
+
193
+ def register(self, name, val):
194
+ self.shadow[name] = val.clone()
195
+
196
+ def __call__(self, model, step):
197
+ decay = min(self.decay, (1 + step) / (10.0 + step))
198
+ for name, param in model.named_parameters():
199
+ if param.requires_grad:
200
+ assert name in self.shadow
201
+ new_average = \
202
+ (1.0 - decay) * param.data + decay * self.shadow[name]
203
+ self.shadow[name] = new_average.clone()
204
+
205
+ def assign(self, model):
206
+ for name, param in model.named_parameters():
207
+ if param.requires_grad:
208
+ assert name in self.shadow
209
+ self.original[name] = param.data.clone()
210
+ param.data = self.shadow[name]
211
+
212
+ def resume(self, model):
213
+ for name, param in model.named_parameters():
214
+ if param.requires_grad:
215
+ assert name in self.shadow
216
+ param.data = self.original[name]
217
+
218
+
219
+ class BertAdam(Optimizer):
220
+ """Implements BERT version of Adam algorithm with weight decay fix.
221
+ Params:
222
+ lr: learning rate
223
+ warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
224
+ t_total: total number of training steps for the learning
225
+ rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
226
+ schedule: schedule to use for the warmup (see above).
227
+ Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
228
+ If `None` or `'none'`, learning rate is always kept constant.
229
+ Default : `'warmup_linear'`
230
+ b1: Adams b1. Default: 0.9
231
+ b2: Adams b2. Default: 0.999
232
+ e: Adams epsilon. Default: 1e-6
233
+ weight_decay: Weight decay. Default: 0.01
234
+ max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
235
+ """
236
+ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
237
+ b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
238
+ if lr is not required and lr < 0.0:
239
+ raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
240
+ if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
241
+ raise ValueError("Invalid schedule parameter: {}".format(schedule))
242
+ if not 0.0 <= b1 < 1.0:
243
+ raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
244
+ if not 0.0 <= b2 < 1.0:
245
+ raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
246
+ if not e >= 0.0:
247
+ raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
248
+ # initialize schedule object
249
+ if not isinstance(schedule, _LRSchedule):
250
+ schedule_type = SCHEDULES[schedule]
251
+ schedule = schedule_type(warmup=warmup, t_total=t_total)
252
+ else:
253
+ if warmup != -1 or t_total != -1:
254
+ logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
255
+ "Please specify custom warmup and t_total in _LRSchedule object.")
256
+ defaults = dict(lr=lr, schedule=schedule,
257
+ b1=b1, b2=b2, e=e, weight_decay=weight_decay,
258
+ max_grad_norm=max_grad_norm)
259
+ super(BertAdam, self).__init__(params, defaults)
260
+
261
+ def get_lr(self):
262
+ lr = []
263
+ for group in self.param_groups:
264
+ for p in group['params']:
265
+ state = self.state[p]
266
+ if len(state) == 0:
267
+ return [0]
268
+ lr_scheduled = group['lr']
269
+ lr_scheduled *= group['schedule'].get_lr(state['step'])
270
+ lr.append(lr_scheduled)
271
+ return lr
272
+
273
+ def step(self, closure=None):
274
+ """Performs a single optimization step.
275
+
276
+ Arguments:
277
+ closure (callable, optional): A closure that reevaluates the model
278
+ and returns the loss.
279
+ """
280
+ loss = None
281
+ if closure is not None:
282
+ loss = closure()
283
+
284
+ for group in self.param_groups:
285
+ for p in group['params']:
286
+ if p.grad is None:
287
+ continue
288
+ grad = p.grad.data
289
+ if grad.is_sparse:
290
+ raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
291
+
292
+ state = self.state[p]
293
+
294
+ # State initialization
295
+ if len(state) == 0:
296
+ state['step'] = 0
297
+ # Exponential moving average of gradient values
298
+ state['next_m'] = torch.zeros_like(p.data)
299
+ # Exponential moving average of squared gradient values
300
+ state['next_v'] = torch.zeros_like(p.data)
301
+
302
+ next_m, next_v = state['next_m'], state['next_v']
303
+ beta1, beta2 = group['b1'], group['b2']
304
+
305
+ # Add grad clipping
306
+ if group['max_grad_norm'] > 0:
307
+ clip_grad_norm_(p, group['max_grad_norm'])
308
+
309
+ # Decay the first and second moment running average coefficient
310
+ # In-place operations to update the averages at the same time
311
+ next_m.mul_(beta1).add_(grad, alpha=1 - beta1)
312
+ next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
313
+ update = next_m / (next_v.sqrt() + group['e'])
314
+
315
+ # Just adding the square of the weights to the loss function is *not*
316
+ # the correct way of using L2 regularization/weight decay with Adam,
317
+ # since that will interact with the m and v parameters in strange ways.
318
+ #
319
+ # Instead we want to decay the weights in a manner that doesn't interact
320
+ # with the m/v parameters. This is equivalent to adding the square
321
+ # of the weights to the loss with plain (non-momentum) SGD.
322
+ if group['weight_decay'] > 0.0:
323
+ update += group['weight_decay'] * p.data
324
+
325
+ lr_scheduled = group['lr']
326
+ lr_scheduled *= group['schedule'].get_lr(state['step'])
327
+
328
+ update_with_lr = lr_scheduled * update
329
+ p.data.add_(-update_with_lr)
330
+
331
+ state['step'] += 1
332
+
333
+ # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
334
+ # No bias correction
335
+ # bias_correction1 = 1 - beta1 ** state['step']
336
+ # bias_correction2 = 1 - beta2 ** state['step']
337
+
338
+ return loss
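Since `optimization.py` bundles the warmup LR schedules, the `EMA` helper, and `BertAdam`, here is a minimal, hedged sketch of how they could be wired together. The model, parameter grouping, learning rate, and step count are illustrative placeholders, not values taken from this repo's training code.

```python
# Minimal sketch of using BertAdam with a warmup-linear schedule plus EMA.
# All hyperparameters and the toy model below are placeholder assumptions.
import torch
import torch.nn as nn
from baselines.crossmodal_moment_localization.optimization import BertAdam, EMA

model = nn.Linear(768, 2)  # stand-in for the real model
no_decay = ["bias", "LayerNorm.weight"]
param_groups = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
num_train_steps = 1000
optimizer = BertAdam(param_groups, lr=1e-4, warmup=0.01,
                     t_total=num_train_steps, schedule="warmup_linear")

# Optional EMA bookkeeping, mirroring the register/__call__/assign/resume API above.
ema = EMA(decay=0.999)
for name, p in model.named_parameters():
    if p.requires_grad:
        ema.register(name, p.data)

for step in range(5):  # a few dummy steps
    loss = model(torch.randn(4, 768)).sum()  # dummy forward/loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    ema(model, step)  # update the shadow (moving-average) weights

# Evaluation-time pattern: ema.assign(model) -> evaluate -> ema.resume(model)
```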
baselines/crossmodal_moment_localization/scripts/eval.sh ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/crossmodal_moment_localization/scripts/eval.sh ANY_OTHER_PYTHON_ARGS
5
+ eval_split_name=$1
6
+ submission_path=$2
7
+ save_path=$3
8
+ gt_path=data/tvr_${eval_split_name}_release.jsonl
9
+
10
+ python standalone_eval/eval.py \
11
+ --gt_path ${gt_path} \
12
+ --submission_path ${submission_path} \
13
+ --save_path ${save_path} \
14
+ ${@:4}
baselines/crossmodal_moment_localization/scripts/inference.sh ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/crossmodal_moment_localization/scripts/inference.sh ANY_OTHER_PYTHON_ARGS
5
+ model_dir=$1
6
+ eval_split_name=$2
7
+ eval_path=data/tvr_${eval_split_name}_release.jsonl
8
+ tasks=()
9
+ tasks+=(VCMR)
10
+ tasks+=(SVMR)
11
+ tasks+=(VR)
12
+ echo "tasks ${tasks[@]}"
13
+ python baselines/crossmodal_moment_localization/inference.py \
14
+ --model_dir ${model_dir} \
15
+ --tasks ${tasks[@]} \
16
+ --eval_split_name ${eval_split_name} \
17
+ --eval_path ${eval_path} \
18
+ ${@:3}
baselines/crossmodal_moment_localization/scripts/inference_with_external.sh ADDED
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/crossmodal_moment_localization/scripts/inference_with_external.sh
5
+ #model_dir=$1
6
+ # Do not use NMS, since it gives worse results
7
+ eval_model=$1 # [xml, xml_tef]
8
+ eval_split_name=$2
9
+ external_model=mee # [mee, mcn, cal]
10
+ eval_path=data/tvr_${eval_split_name}_release.jsonl
11
+ project_root=./baselines
12
+
13
+ # setup eval model
14
+ if [[ ${eval_model} == xml ]]; then
15
+ eval_model_dir=tvr-video_sub-resnet_i3d_no_norm_v-2019_11_03_12_22_19
16
+ elif [[ ${eval_model} == xml_tef ]]; then
17
+ eval_model_dir=tvr-video_sub_tef-resnet_i3d_no_norm_v-2019_11_03_12_53_01
18
+ fi
19
+
20
+ # setup external
21
+ if [[ ${external_model} == mee ]]; then
22
+ external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39
23
+ external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json
24
+ fi
25
+
26
+ tasks=(VR)
27
+ tasks+=(SVMR)
28
+ tasks+=(VCMR)
29
+ echo "tasks ${tasks[@]}"
30
+ python baselines/crossmodal_moment_localization/inference.py \
31
+ --model_dir ${eval_model_dir} \
32
+ --tasks ${tasks[@]} \
33
+ --eval_split_name ${eval_split_name} \
34
+ --eval_path ${eval_path} \
35
+ --external_inference_vr_res_path ${external_inference_vr_res_path} \
36
+ --eval_id ${external_model_dir} \
37
+ ${@:3}
38
+
39
+ #--use_intermediate \ # temporarily removed
40
+
baselines/crossmodal_moment_localization/scripts/train.sh ADDED
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env bash
2
+ # run at project root dir
3
+ # Usage:
4
+ # bash baselines/crossmodal_moment_localization/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS
5
+ # use --eval_tasks_at_training ["VR", "SVMR", "VCMR"] --stop_task ["VR", "SVMR", "VCMR"] to select which tasks are evaluated during training and which task drives early stopping
6
+ # use --lw_neg_q 0 --lw_neg_ctx 0 for training SVMR only
7
+ # use --lw_st_ed 0 for training with VR only
8
+ dset_name=$1 # see case below
9
+ ctx_mode=$2 # [video, sub, tef, video_sub, video_tef, sub_tef, video_sub_tef]
10
+ vid_feat_type=$3 # [resnet, i3d, resnet_i3d]
11
+ feature_root=data/tvr_feature_release
12
+ results_root=baselines/crossmodal_moment_localization/results
13
+ vid_feat_size=2048
14
+ extra_args=()
15
+
16
+ if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then
17
+ if [[ ${dset_name} != "tvr" ]]; then
18
+ echo "The use of subtitles is only supported in tvr."
19
+ exit 1
20
+ fi
21
+ fi
22
+
23
+
24
+ case ${dset_name} in
25
+ tvr)
26
+ train_path=data/tvr_train_release.jsonl
27
+ corpus_path=data/tvr_video2dur_idx.json
28
+ desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5
29
+ if [[ ${vid_feat_type} == "i3d" ]]; then
30
+ echo "Using I3D feature with shape 1024"
31
+ vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5
32
+ vid_feat_size=1024
33
+ elif [[ ${vid_feat_type} == "resnet" ]]; then
34
+ echo "Using ResNet feature with shape 2048"
35
+ vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5
36
+ vid_feat_size=2048
37
+ elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then
38
+ echo "Using concatenated ResNet and I3D feature with shape 2048+1024"
39
+ vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5
40
+ vid_feat_size=3072
41
+ extra_args+=(--no_norm_vfeat) # since they are already normalized.
42
+ fi
43
+ eval_split_name=val
44
+ nms_thd=-1
45
+ extra_args+=(--eval_path)
46
+ extra_args+=(data/tvr_val_release.jsonl)
47
+ clip_length=1.5
48
+ extra_args+=(--max_ctx_l)
49
+ extra_args+=(100) # max_ctx_l = 100 for clip_length = 1.5, only ~109/21825 has more than 100.
50
+ extra_args+=(--max_pred_l)
51
+ extra_args+=(16)
52
+ if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then
53
+ echo "Running with sub."
54
+ desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite
55
+ sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5
56
+ sub_feat_size=768
57
+ extra_args+=(--sub_feat_size)
58
+ extra_args+=(${sub_feat_size})
59
+ extra_args+=(--sub_bert_path)
60
+ extra_args+=(${sub_bert_path})
61
+ fi
62
+ ;;
63
+ *)
64
+ echo -n "Unknown argument"
65
+ ;;
66
+ esac
67
+
68
+ echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]"
69
+ echo "Extra args ${extra_args[@]}"
70
+ echo " python baselines/crossmodal_moment_localization/train.py --dset_name=${dset_name} --eval_split_name=${eval_split_name} --nms_thd=${nms_thd} --results_root=${results_root} --train_path=${train_path} --desc_bert_path=${desc_bert_path} --corpus_path=${corpus_path} --vid_feat_path=${vid_feat_path} --clip_length=${clip_length} --vid_feat_size=${vid_feat_size} --ctx_mode=${ctx_mode} ${extra_args[@]} ${@:4}"
baselines/crossmodal_moment_localization/start_end_dataset.py ADDED
@@ -0,0 +1,393 @@
1
+ """
2
+ Dataset for clip model
3
+ """
4
+ import logging
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+ import numpy as np
8
+ import h5py
9
+ import time
10
+ import math
11
+ import random
12
+ from tqdm import tqdm
13
+ from utils.basic_utils import load_json, l2_normalize_np_array, flat_list_of_lists, merge_dicts
14
+ from utils.tensor_utils import pad_sequences_1d
15
+ from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \
16
+ get_didemo_agreed_ts
17
+ import pandas as pd
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class StartEndDataset(Dataset):
23
+ """
24
+ Args:
25
+ dset_name, str, ["tvr"]
26
+ ctx_mode: str,
27
+ Return:
28
+ a dict: {
29
+ "meta": {
30
+ "query_id": int,
31
+ "desc": str,
32
+ "vid_name": str,
33
+ "duration": float,
34
+ "ts": [st (float), ed (float)], seconds, ground_truth timestamps
35
+ }
36
+ "model_inputs": {
37
+ "query_feat": torch.tensor, (L, D_q)
38
+ "video_feat": torch.tensor, (n_clip_in_moment, D_video)
39
+ "sub_feat": torch.tensor, (n_clip_in_moment, D_sub)
40
+ "st_ed_indices": torch.LongTensor, (2, )
41
+ }
42
+ }
43
+ """
44
+ def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler,
45
+ max_desc_len, max_ctx_len,
46
+ vid_feat_path_or_handler, clip_length, ctx_mode="video",
47
+ normalize_vfeat=True, normalize_tfeat=True, h5driver=None, data_ratio=1.0):
48
+ self.dset_name = dset_name
49
+ self.data_path = data_path
50
+ self.data_ratio = data_ratio
51
+
52
+ self.desc_bert_path_or_handler = desc_bert_path_or_handler
53
+ self.max_desc_len = max_desc_len
54
+
55
+ self.sub_bert_path_or_handler = sub_bert_path_or_handler
56
+ self.max_ctx_len = max_ctx_len
57
+ self.vid_feat_path_or_handler = vid_feat_path_or_handler
58
+ self.clip_length = clip_length
59
+ self.ctx_mode = ctx_mode
60
+
61
+ # prepare desc data
62
+ self.data = self.expand_annotations(load_json(data_path))
63
+
64
+ if self.data_ratio != 1:
65
+ n_examples = int(len(self.data) * data_ratio)
66
+ self.data = self.data[:n_examples]
67
+ logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples))
68
+
69
+ self.use_video = "video" in self.ctx_mode
70
+ self.use_sub = "sub" in self.ctx_mode
71
+ self.use_tef = "tef" in self.ctx_mode
72
+
73
+ if self.use_video:
74
+ if isinstance(vid_feat_path_or_handler, h5py.File):
75
+ self.vid_feat_h5 = vid_feat_path_or_handler
76
+ else: # str path
77
+ self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver)
78
+
79
+ if isinstance(desc_bert_path_or_handler, h5py.File):
80
+ self.desc_bert_h5 = desc_bert_path_or_handler
81
+ else:
82
+ self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver)
83
+
84
+ if self.use_sub:
85
+ if isinstance(sub_bert_path_or_handler, h5py.File):
86
+ self.sub_bert_h5 = sub_bert_path_or_handler
87
+ else: # str path
88
+ self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver)
89
+
90
+ self.normalize_vfeat = normalize_vfeat
91
+ self.normalize_tfeat = normalize_tfeat
92
+
93
+ def __len__(self):
94
+ return len(self.data)
95
+
96
+ def expand_annotations(self, annotations):
97
+ new_annotations = []
98
+ for i in annotations:
99
+ query = i["query"]
100
+ query_id = i["query_id"]
101
+ for moment in i["relevant_moment"]:
102
+ moment.update({'query': query, 'query_id': query_id})
103
+ new_annotations.append(moment)
104
+ return new_annotations
105
+
106
+ def __getitem__(self, index):
107
+ raw_data = self.data[index]
108
+
109
+ # initialize with basic data
110
+ meta = dict(
111
+ query_id=raw_data["query_id"],
112
+ desc=raw_data["query"],
113
+ vid_name=raw_data["video_name"],
114
+ duration=raw_data["duration"],
115
+ ts=raw_data["timestamp"] ,
116
+ )
117
+ model_inputs = dict()
118
+ model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"])
119
+
120
+ ctx_l = 0
121
+ if self.use_video:
122
+ video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D)
123
+ if self.normalize_vfeat:
124
+ video_feat = l2_normalize_np_array(video_feat)
125
+ model_inputs["video_feat"] = torch.from_numpy(video_feat)
126
+ ctx_l = len(video_feat)
127
+ else:
128
+ model_inputs["video_feat"] = torch.zeros((2, 2))
129
+
130
+ if self.use_sub: # no need for ctx feature, as the features are already contextualized
131
+ sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t)
132
+ if self.normalize_tfeat:
133
+ sub_feat = l2_normalize_np_array(sub_feat)
134
+ model_inputs["sub_feat"] = torch.from_numpy(sub_feat)
135
+ ctx_l = len(sub_feat)
136
+ else:
137
+ model_inputs["sub_feat"] = torch.zeros((2, 2))
138
+
139
+ if self.use_tef:
140
+ # note the tef features here are normalized clip indices (1.5 secs), instead of the original time (1 sec)
141
+ ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l
142
+ tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l
143
+ tef_ed = tef_st + 1.0 / ctx_l
144
+ tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2)
145
+ model_inputs["tef_feat"] = tef
146
+ else:
147
+ model_inputs["tef_feat"] = torch.zeros((2, 2))
148
+
149
+ if self.use_video and self.use_tef:
150
+ model_inputs["video_feat"] = torch.cat(
151
+ [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2)
152
+ if self.use_sub and self.use_tef:
153
+ model_inputs["sub_feat"] = torch.cat(
154
+ [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2)
155
+
156
+ model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], max_idx=ctx_l-1)
157
+ return dict(meta=meta, model_inputs=model_inputs)
158
+
159
+ def get_st_ed_label(self, ts, max_idx):
160
+ """
161
+ Args:
162
+ ts: [st (float), ed (float)] in seconds, ed > st
163
+ max_idx: length of the video
164
+
165
+ Returns:
166
+ [st_idx, ed_idx]: int,
167
+
168
+ Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6,
169
+ clips should be indexed as [2: 6), the translated back ts should be [3:9].
170
+ # TODO which one is better, [2: 5] or [2: 6)
171
+ """
172
+ st_idx = min(math.floor(ts[0] / self.clip_length), max_idx)
173
+ ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx)
174
+ return torch.LongTensor([st_idx, ed_idx])
175
+
176
+ def get_query_feat_by_query_id(self, query_id):
177
+ query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len]
178
+ if self.normalize_tfeat:
179
+ query_feat = l2_normalize_np_array(query_feat)
180
+ return torch.from_numpy(query_feat)
181
+
182
+
183
+ class StartEndEvalDataset(Dataset):
184
+ """
185
+ init_data_mode: `video_query` or `video_only` or `query_only`,
186
+ it indicates which data to load when initialize the Dataset object.
187
+ data_mode: `context` or `query`, it indicates which data to return for self.__get_item__()
188
+ desc_bert_path_or_handler: h5py.File object or str path
189
+ vid_feat_path_or_handler: h5py.File object or str path
190
+ eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with
191
+ max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals.
192
+ load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval.
193
+ data_ratio: percentage of query data to use.
194
+ """
195
+ def __init__(self, data_path=None,
196
+ desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None,
197
+ sub_bert_path_or_handler=None, vid_feat_path_or_handler=None,
198
+ corpus_path=None, clip_length=None,
199
+ ctx_mode="video", data_mode="context",
200
+ h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True):
201
+ self.ctx_mode = ctx_mode
202
+ self.load_gt_video = False
203
+ self.data_ratio = data_ratio # only affect query data
204
+ self.normalize_vfeat = normalize_vfeat
205
+ self.normalize_tfeat = normalize_tfeat
206
+
207
+ self.data_mode = None
208
+ self.set_data_mode(data_mode)
209
+
210
+ self.max_desc_len = max_desc_len
211
+ self.max_ctx_len = max_ctx_len
212
+ self.data_path = data_path
213
+
214
+
215
+ self.annotations = load_json(data_path)
216
+ self.ground_truth = self.get_relevant_moment_gt()
217
+
218
+
219
+ if isinstance(desc_bert_path_or_handler, h5py.File):
220
+ self.desc_bert_h5 = desc_bert_path_or_handler
221
+ else:
222
+ self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver)
223
+
224
+ video_data = load_json(corpus_path)
225
+ self.video_data = [{"vid_name": k, "duration": v} for k, v in video_data.items()]
226
+ self.video2idx = {k: v for k, v in video_data.items()}
227
+ self.clip_length = clip_length
228
+
229
+ self.use_video = "video" in self.ctx_mode
230
+ self.use_sub = "sub" in self.ctx_mode
231
+ self.use_tef = "tef" in self.ctx_mode
232
+
233
+ if self.use_video:
234
+ if isinstance(vid_feat_path_or_handler, h5py.File):
235
+ self.vid_feat_h5 = vid_feat_path_or_handler
236
+ else: # str path
237
+ self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver)
238
+
239
+ if self.use_sub:
240
+ if isinstance(sub_bert_path_or_handler, h5py.File):
241
+ self.sub_bert_h5 = sub_bert_path_or_handler
242
+ else: # str path
243
+ self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver)
244
+
245
+
246
+ def get_relevant_moment_gt(self):
247
+ gt_all = {}
248
+ for data in self.annotations:
249
+ gt_all[data["query_id"]] = data["relevant_moment"]
250
+ return gt_all
251
+
252
+ def set_data_mode(self, data_mode):
253
+ """context or query"""
254
+ assert data_mode in ["context", "query"]
255
+ self.data_mode = data_mode
256
+
257
+ # def load_gt_vid_name_for_query(self, load_gt_video):
258
+ # """load_gt_video: bool, affect the returned value of self._get_item_query"""
259
+ # if load_gt_video:
260
+ # assert "vid_name" in self.query_data[0]
261
+ # self.load_gt_video = load_gt_video
262
+
263
+ def __len__(self):
264
+ if self.data_mode == "context":
265
+ return len(self.video_data)
266
+ else:
267
+ return len(self.annotations)
268
+
269
+ def __getitem__(self, index):
270
+ if self.data_mode == "context":
271
+ return self._get_item_context(index)
272
+ else:
273
+ return self._get_item_query(index)
274
+
275
+ def get_query_feat_by_query_id(self, query_id):
276
+ query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len]
277
+ if self.normalize_tfeat:
278
+ query_feat = l2_normalize_np_array(query_feat)
279
+ return torch.from_numpy(query_feat)
280
+
281
+ def _get_item_query(self, index):
282
+ """Need to batch"""
283
+ raw_data = self.annotations[index]
284
+
285
+ meta = dict(
286
+ query_id=raw_data["query_id"],
287
+ desc=raw_data["query"],
288
+ vid_name=raw_data["video_name"] if self.load_gt_video else None
289
+ )
290
+
291
+ model_inputs = dict()
292
+ model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"])
293
+ return dict(meta=meta, model_inputs=model_inputs)
294
+
295
+ def get_st_ed_label(self, ts, max_idx):
296
+ """
297
+ Args:
298
+ ts: [st (float), ed (float)] in seconds, ed > st
299
+ max_idx: length of the video
300
+
301
+ Returns:
302
+ [st_idx, ed_idx]: int,
303
+
304
+ Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6,
305
+ clips should be indexed as [2: 6), the translated back ts should be [3:9].
306
+ Given ts = [5, 9], st_idx = 3, ed_idx = 6,
307
+ clips should be indexed as [3: 6), the translated back ts should be [4.5:9].
308
+ # TODO which one is better, [2: 5] or [2: 6)
309
+ """
310
+ # TODO ed_idx -= 1, should also modify relevant code in inference.py
311
+ st_idx = min(math.floor(ts[0] / self.clip_length), max_idx)
312
+ ed_idx = min(math.ceil(ts[1] / self.clip_length) - 1, max_idx) # st_idx could be the same as ed_idx
313
+ return torch.LongTensor([st_idx, ed_idx])
314
+
315
+ def _get_item_context(self, index):
316
+ """No need to batch, since it has already been batched here"""
317
+ raw_data = self.video_data[index]
318
+
319
+ # initialize with basic data
320
+ meta = dict(
321
+ vid_name=raw_data["vid_name"],
322
+ duration=raw_data["duration"],
323
+ )
324
+
325
+ model_inputs = dict()
326
+ ctx_l = 0
327
+
328
+ if self.use_video:
329
+ video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D)
330
+ if self.normalize_vfeat:
331
+ video_feat = l2_normalize_np_array(video_feat)
332
+ model_inputs["video_feat"] = torch.from_numpy(video_feat)
333
+ ctx_l = len(video_feat)
334
+ else:
335
+ model_inputs["video_feat"] = torch.zeros((2, 2))
336
+
337
+ if self.use_sub: # no need for ctx feature, as the features are already contextualized
338
+ sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t)
339
+ if self.normalize_tfeat:
340
+ sub_feat = l2_normalize_np_array(sub_feat)
341
+ model_inputs["sub_feat"] = torch.from_numpy(sub_feat)
342
+ ctx_l = len(sub_feat)
343
+ else:
344
+ model_inputs["sub_feat"] = torch.zeros((2, 2))
345
+
346
+ if self.use_tef:
347
+ ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l
348
+ tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l
349
+ tef_ed = tef_st + 1.0 / ctx_l
350
+ tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2)
351
+ model_inputs["tef_feat"] = tef
352
+ else:
353
+ model_inputs["tef_feat"] = torch.zeros((2, 2))
354
+
355
+ if self.use_video and self.use_tef:
356
+ model_inputs["video_feat"] = torch.cat(
357
+ [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2)
358
+ if self.use_sub and self.use_tef:
359
+ model_inputs["sub_feat"] = torch.cat(
360
+ [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2)
361
+ return dict(meta=meta, model_inputs=model_inputs)
362
+
363
+
364
+ def start_end_collate(batch):
365
+ batch_meta = [e["meta"] for e in batch] # meta dicts are kept as a list, no need to collate
366
+
367
+ model_inputs_keys = batch[0]["model_inputs"].keys()
368
+ batched_data = dict()
369
+ for k in model_inputs_keys:
370
+ if "feat" in k:
371
+ batched_data[k] = pad_sequences_1d(
372
+ [e["model_inputs"][k] for e in batch], dtype=torch.float32, fixed_length=None)
373
+
374
+ if "st_ed_indices" in model_inputs_keys:
375
+ batched_data["st_ed_indices"] = torch.stack(
376
+ [e["model_inputs"]["st_ed_indices"] for e in batch], dim=0)
377
+ return batch_meta, batched_data
378
+
379
+
380
+ def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False):
381
+ model_inputs = {}
382
+ for k, v in batched_model_inputs.items():
383
+ if "feat" in k:
384
+ model_inputs[k] = v[0].to(device, non_blocking=non_blocking)
385
+ model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking)
386
+ else:
387
+ model_inputs[k] = v.to(device, non_blocking=non_blocking)
388
+ return model_inputs
389
+
390
+
391
+ if __name__ == '__main__':
392
+ from baselines.crossmodal_moment_localization.config import BaseOptions
393
+ options = BaseOptions().parse()
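To make the relationship between `StartEndDataset`, `start_end_collate`, and `prepare_batch_inputs` concrete, a minimal usage sketch follows. The feature paths and hyperparameters are placeholders in the spirit of `scripts/train.sh`; they are assumptions, not verified defaults.

```python
# Hypothetical wiring of the dataset pieces above; paths and hyperparameters
# are placeholders, not the repo's actual settings.
import torch
from torch.utils.data import DataLoader
from baselines.crossmodal_moment_localization.start_end_dataset import (
    StartEndDataset, start_end_collate, prepare_batch_inputs)

dataset = StartEndDataset(
    dset_name="tvr",
    data_path="data/tvr_train_release.jsonl",
    desc_bert_path_or_handler="path/to/query_bert.h5",   # placeholder
    sub_bert_path_or_handler="path/to/sub_bert.h5",       # placeholder
    max_desc_len=30, max_ctx_len=100,
    vid_feat_path_or_handler="path/to/video_feat.h5",     # placeholder
    clip_length=1.5, ctx_mode="video_sub_tef")

loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=start_end_collate)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for batch_meta, batched_inputs in loader:
    model_inputs = prepare_batch_inputs(batched_inputs, device)
    # model_inputs now holds the padded *_feat tensors, their *_mask tensors,
    # and the st_ed_indices labels, ready to feed into the model.
    break
```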