diff --git a/baselines/__init__.py b/baselines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/__pycache__/__init__.cpython-311.pyc b/baselines/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54c099d3ef7936fc280d7487c04591f749de52e1 Binary files /dev/null and b/baselines/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/README.md b/baselines/clip_alignment_with_language/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8629394429e1dee1b3dafe346a49569885f851c8 --- /dev/null +++ b/baselines/clip_alignment_with_language/README.md @@ -0,0 +1,25 @@ +# Clip Alignment With Language +This folder contains the CAL model described in the paper +``` +@article{Escorcia2019TemporalLO, + title={Temporal Localization of Moments in Video Collections with Natural Language}, + author={Victor Escorcia and Mattia Soldan and Josef Sivic and Bernard Ghanem and Bryan Russell}, + journal={ArXiv}, + year={2019}, + volume={abs/1907.12763} +} +``` + +It also resembles the MCN model in +``` +@article{Hendricks2017LocalizingMI, + title={Localizing Moments in Video with Natural Language}, + author={Lisa Anne Hendricks and Oliver Wang and Eli Shechtman and Josef Sivic and Trevor Darrell and Bryan C. Russell}, + journal={2017 IEEE International Conference on Computer Vision (ICCV)}, + year={2017}, + pages={5804-5813} +} +``` + +Disclaimer: This code is implemented by [Jie Lei](http://www.cs.unc.edu/~jielei/) for the TVR dataset, +it does not guarantee the reproducibility of the original authors' results. \ No newline at end of file diff --git a/baselines/clip_alignment_with_language/__init__.py b/baselines/clip_alignment_with_language/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d2ae334fa75b3ba2568a715e4fb67ebf04c38d1 Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eab312801d33f658c8d6eb196feaba542e3eb4ba Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/config.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d01ecf759821ef426656300ffcbf3ab974383a75 Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/inference.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57ced9a9be2217629a93e3a0d92727b38c11953a Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/model.cpython-311.pyc differ diff --git 
a/baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc b/baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a554bc2a28878ad9c0450a798831f8d1d6abdae Binary files /dev/null and b/baselines/clip_alignment_with_language/__pycache__/proposal_retrieval_dataset.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/config.py b/baselines/clip_alignment_with_language/config.py new file mode 100644 index 0000000000000000000000000000000000000000..db3ba382f839f0bd80d7becb15c753a37428bbb3 --- /dev/null +++ b/baselines/clip_alignment_with_language/config.py @@ -0,0 +1,207 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile +from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default="res", help="id of the current run") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. 
" + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=0.05, help="learning rate") + self.parser.add_argument("--wd", type=float, default=0, help="weight decay") + self.parser.add_argument("--momentum", type=float, default=0.95, help="momentum for SGD") + self.parser.add_argument("--n_epoch", type=int, default=108, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=108, help="number of epochs to early stop") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=1000, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_proposal_bsz", type=int, default=200, + help="mini-batch size at inference, for proposals") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") + self.parser.add_argument("--inter_loss_weight", type=float, default=0.4, help="margin for ranking loss") + self.parser.add_argument("--loss_type", type=str, default="hinge", choices=["hinge", "lse"], + help="att loss type, can be hinge loss or its smooth approximation LogSumExp") + + # Model and Data config + self.parser.add_argument("--max_sub_l", type=int, default=50, + help="max length of all sub sentence 97.71 under 50 for 3 sentences") + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--pos_iou_thd", type=float, default=0.7, help="moments with IoU >= as positive") + self.parser.add_argument("--neg_iou_thd", type=float, default=0.35, help="moments with IoU < as negative") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--eval_path", type=str, default=None, + help="Evaluating during training, for Dev set. If None, will only do training, " + "anet_cap and charades_sta has no dev set, so None") + self.parser.add_argument("--external_train_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide " + "inter-nvideo negative sampling. ") + self.parser.add_argument("--init_ckpt_path", type=str, default=None, + help="init model parameters from checkpoint. Use absolute path") + self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide evaluation. 
") + self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features") + self.parser.add_argument("--word2idx_path", type=str, + help="a dict, {word: word_idx, ...}, " + "special tokens are {: 0, : 1, : 2}") + self.parser.add_argument("--vocab_size", type=int, default=-1, + help="Set automatically to len(word2idx)") + self.parser.add_argument("--glove_path", type=str, + help="path to file containing the GloVe embeddings for words in word2idx") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--desc_feat_size", type=int, default=768) + self.parser.add_argument("--ctx_mode", type=str, + choices=["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. a combination of [video, sub, tef]") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it when using i3d_resnet concat feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + self.parser.add_argument("--clip_length", type=float, default=None, + help="each video will be uniformly segmented into small clips, " + "will automatically loaded from ProposalConfigs if None") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + + self.parser.add_argument("--model_type", default="cal", choices=["cal", "mcn"]) + self.parser.add_argument("--embedding_size", type=int, default=768) + self.parser.add_argument("--lstm_hidden_size", type=int, default=256) + self.parser.add_argument("--visual_hidden_size", type=int, default=256) + self.parser.add_argument("--output_size", type=int, default=256) + + # post processing + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 0.6 for charades_sta, 0.5 for anet_cap,") + self.parser.add_argument("--max_after_nms", type=int, default=100, help="Stores at max_after_nms for eval") + self.parser.add_argument("--max_before_nms", type=int, default=300, help="Max before nms") + self.parser.add_argument("--use_intermediate", action="store_true", + help="Whether to use/save intermediate results to results directory." 
+ "Might want use this if we are going to ") + + def save_args(self, opt): + args = vars(opt) + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", "eval_split_name", "eval_path", + "use_intermediate", "external_inference_vr_res_path"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + if opt.clip_length is None: + opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.model_type, opt.ctx_mode, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"]) + + self.save_args(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.pin_memory = not opt.no_pin_memory + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(vars(opt).items())})) + self.opt = opt + return opt + + +class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", choices=["VCMR", "SVMR", "VR"], default="SVMR", + help="Which tasks to run." 
+ "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval.") diff --git a/baselines/clip_alignment_with_language/inference.py b/baselines/clip_alignment_with_language/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..b11d669682346aef7b8eb09788348a948847e14b --- /dev/null +++ b/baselines/clip_alignment_with_language/inference.py @@ -0,0 +1,672 @@ +import os +import time +import math +import pprint +import numpy as np +from tqdm import tqdm, trange +from collections import defaultdict, OrderedDict + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.clip_alignment_with_language.config import TestOptions +from baselines.clip_alignment_with_language.model import CALWithSub +from baselines.clip_alignment_with_language.proposal_retrieval_dataset import \ + proposal_retrieval_collate, ProposalRetrievalEvalDataset, prepare_batch_inputs +from utils.basic_utils import save_jsonl, save_json, load_json +from utils.temporal_nms import temporal_non_maximum_suppression +from utils.tensor_utils import pad_sequences_1d +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list): + """ + Args: + proposals_embedding_list: list(torch.Tensor), bsz * (N_prop, N_clips, D_o) + proposals_mask_list: list(torch.Tensor), bsz * (N_prop, N_clips) + """ + if len(proposals_embedding_list) == 1: + return proposals_embedding_list[0], proposals_mask_list[0] + else: # > 1 + max_n_clips = max([e.shape[1] for e in proposals_embedding_list]) + n_proposals = sum([len(e) for e in proposals_embedding_list]) + d = proposals_embedding_list[0].shape[2] + proposals_embedding = proposals_embedding_list[0].new_zeros((n_proposals, max_n_clips, d)) + proposals_mask = proposals_mask_list[0].new_zeros((n_proposals, max_n_clips)) + mask_lengths = [0, ] + [len(m) for m in proposals_mask_list] + mask_cumsum_lengths = np.cumsum(mask_lengths) + for idx, (e, m) in enumerate(zip(proposals_embedding_list, proposals_mask_list)): + proposals_embedding[mask_cumsum_lengths[idx]:mask_cumsum_lengths[idx + 1], :e.shape[1]] = e + proposals_mask[mask_cumsum_lengths[idx]:mask_cumsum_lengths[idx + 1], :m.shape[1]] = m + return proposals_embedding, proposals_mask + + +def compute_query_embeddings(model, eval_dataset, opt, load_gt_vid_name): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 100 (hsz) * 4 / (1024**2) = 7.63 MB + """ + model.eval() + eval_dataset.set_data_mode("query") + eval_dataset.load_gt_vid_name_for_query(load_gt_vid_name) + query_eval_loader = DataLoader(eval_dataset, + collate_fn=proposal_retrieval_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + global_meta_list = [] # list(dicts) + # n_query = min(len(eval_dataset), opt.eval_query_bsz) if opt.debug else len(eval_dataset) + n_query = len(eval_dataset) + global_query_embedding = torch.empty((n_query, + model.config.output_size), + dtype=torch.float32, device=opt.device) # (N_q, D_o) + for idx, batch in tqdm(enumerate(query_eval_loader), + desc="Computing q embedding", + total=len(query_eval_loader)): + global_meta_list.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + global_query_embedding[idx * opt.eval_query_bsz: (idx + 1) * opt.eval_query_bsz] = \ + model.query_encoder(**model_inputs) + + if opt.debug: + break + return global_meta_list, global_query_embedding + + +def compute_proposal_embeddings(model, eval_dataset, opt): + """Use val set to do evaluation, remember to run with torch.no_grad(). + estimated 1000 (videos) * 300 (proposals) * 20 (clips) * 100 (hsz) * 4 / (1024 ** 3) = 2.24 GB + """ + model.eval() + eval_dataset.set_data_mode("context") + global_meta_list = [] # list(dicts) + global_proposal_video_embedding_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips, D_o] + global_proposal_sub_embedding_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips, D_o] + global_proposal_video_mask_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips] + global_proposal_sub_mask_list = [] # list(torch.tensor), N_videos * [N_prop, N_clips] + for idx, single_video_info in tqdm(enumerate(eval_dataset), + desc="Computing prop embedding for videos", + total=len(eval_dataset)): + global_meta_list.append(single_video_info["meta"]) + if model.use_video or model.tef_only: + proposals_features_list = single_video_info["model_inputs"]["video_moment_features_list"] + proposals_mask_list = single_video_info["model_inputs"]["video_moment_mask_list"] + proposals_mask_list = [e.to(opt.device, non_blocking=opt.pin_memory) for e in proposals_mask_list] + proposals_embedding_list = [] # (N_prop, D_o) + for feat in proposals_features_list: + proposals_embedding_list.append( + model.moment_encoder(feat.to(opt.device, non_blocking=opt.pin_memory), module_name="video")) + p, m = combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list) + global_proposal_video_embedding_list.append(p) + global_proposal_video_mask_list.append(m) + else: + global_proposal_video_embedding_list.append(None) + + if model.use_sub: + proposals_features_list = single_video_info["model_inputs"]["sub_moment_features_list"] + proposals_mask_list = single_video_info["model_inputs"]["sub_moment_mask_list"] + proposals_mask_list = [e.to(opt.device, non_blocking=opt.pin_memory) for e in proposals_mask_list] + proposals_embedding_list = [] # (N_prop, D_o) + for feat in proposals_features_list: + proposals_embedding_list.append( + model.moment_encoder(feat.to(opt.device, non_blocking=opt.pin_memory), module_name="sub")) + p, m = combine_single_video_proposal_embeddings(proposals_embedding_list, proposals_mask_list) + global_proposal_sub_embedding_list.append(p) + global_proposal_sub_mask_list.append(m) + else: + 
global_proposal_sub_embedding_list.append(None) + + if opt.debug and idx == 100: + break + global_proposal_mask_list = global_proposal_sub_mask_list if model.use_sub else global_proposal_video_mask_list + return global_meta_list, global_proposal_video_embedding_list, \ + global_proposal_sub_embedding_list, global_proposal_mask_list + + +def compute_query_proposal_distance(model, eval_dataset, opt, tasks=("SVMR",)): + """compute and save query and video proposal embeddings, + tasks: SVMR (single video moment retrieval), VCMR (video corpus moment retrieval) + """ + is_svmr = "SVMR" in tasks + is_vcmr = "VCMR" in tasks + query_meta_list, query_embed = compute_query_embeddings(model, eval_dataset, opt, + load_gt_vid_name=is_svmr) + video_meta_list, video_prop_embed_list, sub_prop_embed_list, prop_mask_list = \ + compute_proposal_embeddings(model, eval_dataset, opt) + + eval_res = dict( + query_meta=query_meta_list, # N_q * dict() + video_meta=video_meta_list, # N_videos * dict() + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + query_prop_dist_vcmr=[], # N_videos * (N_q, N_prop), note N_prop is changing for each video. + query_prop_dist_svmr=[], # N_q * (N_prop, ), each query has a GT video, no need to calc. for all. + ) + if is_vcmr: + for v_prop_embed, s_prop_embed, prop_mask in tqdm( + zip(video_prop_embed_list, sub_prop_embed_list, prop_mask_list), + desc="Computing VCMR q to prop dist for videos", + total=len(video_prop_embed_list)): + query_prop_dist = model.compute_cdist_inference( + query_embed, v_prop_embed, s_prop_embed, prop_mask) # (N_q, N_prop) + eval_res["query_prop_dist_vcmr"].append(query_prop_dist.cpu()) + if opt.debug: + break + + if is_svmr: + if opt.debug: + debug_query_meta = [] + # this is different from video2idx + svmr_video2meta_idx = {e["vid_name"]: idx for idx, e in enumerate(video_meta_list)} + # logger.info("svmr_video2idx {}".format(list(svmr_video2idx.keys())[:3])) + for single_q_embed, single_q_meta in tqdm(zip(query_embed, query_meta_list), + desc="Computing SVMR q to prop dist for videos", + total=len(query_embed)): + # logger.info("single_q_meta[vid_name] {}".format(single_q_meta["vid_name"])) + if opt.debug: + if single_q_meta["vid_name"] not in svmr_video2meta_idx: + continue + debug_query_meta.append(single_q_meta) + q_gt_vid_meta_idx = svmr_video2meta_idx[single_q_meta["vid_name"]] + v_prop_embed = video_prop_embed_list[q_gt_vid_meta_idx] # [N_prop, N_clips, D_o] + s_prop_embed = sub_prop_embed_list[q_gt_vid_meta_idx] # [N_prop, N_clips, D_o] + prop_mask = prop_mask_list[q_gt_vid_meta_idx] # [N_prop, N_clips] + query_prop_dist = model.compute_cdist_inference( + single_q_embed.unsqueeze(0), v_prop_embed, s_prop_embed, prop_mask) # (1, N_prop) + eval_res["query_prop_dist_svmr"].append(query_prop_dist.squeeze(0).cpu().numpy()) + if opt.debug: + eval_res["query_meta"] = debug_query_meta + return eval_res + + +def filter_vcmr_by_nms(all_video_predictions, nms_threshold=0.6, + max_before_nms=1000, max_after_nms=100, score_col_idx=3): + """ Apply non-maximum suppression for all the predictions for each video. + 1) group predictions by video index + 2) apply nms individually for each video index group + 3) combine and sort the predictions + Args: + all_video_predictions: list(sublist), + Each sublist is [video_idx (int), st (float), ed(float), score (float)] + Note the scores are negative distances. 
+ nms_threshold: float + max_before_nms: int + max_after_nms: int + score_col_idx: int + Returns: + + """ + predictions_neg_by_video_group = defaultdict(list) + for pred in all_video_predictions[:max_before_nms]: + predictions_neg_by_video_group[pred[0]].append(pred[1:]) # [st (float), ed(float), score (float)] + + predictions_by_video_group_neg_after_nms = dict() + for video_idx, grouped_preds in predictions_neg_by_video_group.items(): + predictions_by_video_group_neg_after_nms[video_idx] = \ + temporal_non_maximum_suppression(grouped_preds, nms_threshold=nms_threshold) + + predictions_after_nms = [] + for video_idx, grouped_preds in predictions_by_video_group_neg_after_nms.items(): + for pred in grouped_preds: + pred = [video_idx] + pred # [video_idx (int), st (float), ed(float), score (float)] + predictions_after_nms.append(pred) + + # ranking happens across videos + predictions_after_nms = sorted(predictions_after_nms, + key=lambda x: x[score_col_idx], + reverse=True)[:max_after_nms] # descending order + return predictions_after_nms + + +def post_processing_vcmr_nms(vcmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100): + """ + vcmr_res: list(dict), each dict is{ + "desc": str, + "desc_id": int, + "predictions": list(sublist) # each sublist is + [video_idx (int), st (float), ed(float), score (float)], video_idx could be different + } + """ + processed_vcmr_res = [] + for e in vcmr_res: + e["predictions"] = filter_vcmr_by_nms(e["predictions"], + nms_threshold=nms_thd, + max_before_nms=max_before_nms, + max_after_nms=max_after_nms) + processed_vcmr_res.append(e) + return processed_vcmr_res + + +def post_processing_svmr_nms(svmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100): + """ + svmr_res: list(dict), each dict is + {"desc": str, + "desc_id": int, + "predictions": list(sublist) # each sublist is + [video_idx (int), st (float), ed(float), score (float)], video_idx is the same. + } + """ + processed_svmr_res = [] + for e in svmr_res: + # the predictions are sorted inside the nms func. + _predictions = [d[1:] for d in e["predictions"][:max_before_nms]] + _predictions = temporal_non_maximum_suppression( + _predictions, nms_threshold=nms_thd)[:max_after_nms] + _video_id = e["predictions"][0][0] # video_id is the same for all predictions + e["predictions"] = [[_video_id, ] + d for d in _predictions] + processed_svmr_res.append(e) + return processed_svmr_res + + +def generate_vcmr_predictions_from_res_with_external(eval_res, max_prop_per_query=300, query_bsz_in_sort=1000): + """ This function is for Video Corpus Moment Retrieval (VCMR). + Generate prediction file which could be evaluated using standalone_eval.eval. + Args: + eval_res: dict( + query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str} + video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray} + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + video_bsz_in_sort=[], # N_videos * (N_q, N_prop) + ) + max_prop_per_query: int or None. If None, generate ranking for all possible moments, else generate top {}. + query_bsz_in_sort: int, only sort a subset of queries at a time, it will be too large to sort all queries. + return: + list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)), + each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance. 
+ """ + # video2idx + video2idx = eval_res["video2idx"] + video_meta = eval_res["video_meta"] + query_meta = eval_res["query_meta"] + video_idx2meta_idx = {video2idx[m["vid_name"]]: i for i, m in enumerate(video_meta)} + external_query2video = eval_res["external_query2video"] if "external_query2video" in eval_res else None + # 「query idx: [video meta idx]」 + external_query2video_meta_idx = {k: [video_idx2meta_idx[e] for e in v] for k, v in external_query2video.items()} + + external_ordered_video_meta_indices = torch.LongTensor( + [external_query2video_meta_idx[e["desc_id"]] for e in query_meta]) # (Nq, 5) + top_n_retrieved = external_ordered_video_meta_indices.shape[1] + + # (N_videos, N_prop, N_q), (N_videos, N_prop) + padded_dist, padded_mask = pad_sequences_1d([e.transpose(0, 1) for e in eval_res["query_prop_dist_vcmr"]], + dtype=eval_res["query_prop_dist_vcmr"][0].dtype, + device=eval_res["query_prop_dist_vcmr"][0].device) + # putting 'NaN' into the invalid bits, torch.sort considers 'NaN' as larger than any number!!! + padded_dist += (padded_mask.unsqueeze(2) == 0).float() * 1e10 + n_videos, n_prop, n_q = padded_dist.shape + padded_dist = padded_dist.permute(2, 0, 1) # (N_q, N_videos, N_prop) + + # get only top retrieved, N_videos now decreased to top_n_retrieved + row_indices = torch.arange(n_q, device=padded_dist.device) + padded_dist = torch.stack([ + padded_dist[row_indices, external_ordered_video_meta_indices[:, col_idx]] + for col_idx in range(top_n_retrieved)], dim=1) # (N_q, 5, N_prop) + n_videos = top_n_retrieved + + padded_dist = padded_dist.view(n_q, -1).contiguous() # (N_q, N_video*N_prop) + print("n_videos, n_prop, n_q {}".format((n_videos, n_prop, n_q))) + print("padded_dist, {}".format(padded_dist.shape)) + + sorted_distances, sorted_indices = torch.topk(padded_dist.to(torch.device("cuda:0"), non_blocking=True), + k=min(max_prop_per_query, n_videos * n_prop), + dim=1, largest=False, sorted=True) # (N_q, max_prop_per_query) * 2 + print("orted_distances {}, sorted_indices {}".format(sorted_distances.shape, sorted_indices.shape)) + sorted_distances = - sorted_distances.cpu().numpy() + + # (N_q, max_prop_per_query) * 2, prop_indices: inside video indices. 
+ video_meta_indices_retrieved = torch.floor(sorted_indices.float() / n_prop).long().cpu().numpy() + # map back to original video idx (not video meta idx, but real video idx) + video_indices = np.array([[external_query2video[query_meta[i]["desc_id"]][j] for j in r] + for i, r in enumerate(video_meta_indices_retrieved)]) # (N_q, max_prop_per_query) + prop_indices = torch.remainder(sorted_indices, n_prop).cpu().numpy() # (N_q, max_prop_per_query) + print("video_indices {}, prop_indices {}".format(video_indices.shape, prop_indices.shape)) + + vr_res = [] + for i in trange(n_q, desc="[VR] Loop over queries to generate predictions"): + row = video_indices[i] + score_row = - sorted_distances[i] + cur_vr_redictions = [] + for j, video_idx in enumerate(row): + cur_vr_redictions.append([int(video_idx), 0, 0, float(score_row[j])]) + cur_query_pred = dict( + desc_id=query_meta[i]["desc_id"], + desc=query_meta[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + + vcmr_res = [] + logger.debug("sorted_indices {}".format(sorted_indices.shape)) + logger.debug("sorted_distances {}".format(sorted_distances.shape)) + out_bounds_cnt = 0 + for idx, (v_row_indices, p_row_indices) in tqdm(enumerate(zip(video_indices, prop_indices)), + desc="[VCMR] Loop over queries to generate predictions", + total=n_q): # query + sorted_distances_row = - sorted_distances[idx] # converted to negative distance + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [] + for col_idx, (v_col_idx, p_col_idx) in enumerate(zip(v_row_indices, p_row_indices)): + cur_proposals = eval_res["video_meta"][video_idx2meta_idx[v_col_idx]]["proposals"] + cur_pred = [] + cur_pred += [int(v_col_idx), ] + # what is wrong with the indexing below??? (out of bounds), but results seems fine??? + # Not a bug. Since there might be less than max_before_nms proposals from the top retrieved videos + if p_col_idx >= len(cur_proposals): + out_bounds_cnt += 1 + p_col_idx = len(cur_proposals)-1 + cur_pred += cur_proposals[p_col_idx].tolist() + cur_pred += [float(sorted_distances_row[col_idx])] + cur_ranked_predictions.append(cur_pred) + cur_query_pred = dict( + desc_id=eval_res["query_meta"][idx]["desc_id"], + desc=eval_res["query_meta"][idx]["desc"], + predictions=cur_ranked_predictions + ) + vcmr_res.append(cur_query_pred) + logger.info("[DEBUG] out_bounds_cnt {}".format(out_bounds_cnt)) + return vcmr_res, vr_res + + +def generate_vcmr_predictions_from_res(eval_res, max_prop_per_query=300, query_bsz_in_sort=1000): + """ This function is for Video Corpus Moment Retrieval (VCMR). + Generate prediction file which could be evaluated using standalone_eval.eval. + Args: + eval_res: dict( + query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str} + video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray} + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + video_bsz_in_sort=[], # N_videos * (N_q, N_prop) + ) + max_prop_per_query: int or None. If None, generate ranking for all possible moments, else generate top {}. + query_bsz_in_sort: int, only sort a subset of queries at a time, it will be too large to sort all queries. + return: + list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)), + each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance. 
+ """ + # video2idx + video2idx = eval_res["video2idx"] + + # (N_videos, N_prop, N_q), (N_videos, N_prop) + padded_dist, padded_mask = pad_sequences_1d([e.transpose(0, 1) for e in eval_res["query_prop_dist_vcmr"]], + dtype=eval_res["query_prop_dist_vcmr"][0].dtype, + device=eval_res["query_prop_dist_vcmr"][0].device) + # putting 'NaN' into the invalid bits, torch.sort considers 'NaN' as larger than any number!!! + padded_dist += (padded_mask.unsqueeze(2) == 0).float() * 1e10 + n_videos, n_prop, n_q = padded_dist.shape + print("n_videos, n_prop, n_q {}".format((n_videos, n_prop, n_q))) + padded_dist = padded_dist.view(n_videos * n_prop, n_q).transpose(0, 1).contiguous() # (N_q, N_video*N_prop) + print("padded_dist, {}".format(padded_dist.shape)) + + sorted_distances, sorted_indices = torch.topk(padded_dist.to(torch.device("cuda:0"), non_blocking=True), + k=min(max_prop_per_query, n_videos * n_prop), + dim=1, largest=False, sorted=True) # (N_q, max_prop_per_query) * 2 + sorted_distances = - sorted_distances.cpu().numpy() + + # (N_q, max_prop_per_query) * 2, prop_indices: inside video indices. + video_meta_indices = torch.floor(sorted_indices.float() / n_prop).long().cpu().numpy() + prop_indices = torch.remainder(sorted_indices, n_prop).cpu().numpy() + + vr_res = [] + query_meta = eval_res["query_meta"] + for i in trange(n_q, desc="[VR] Loop over queries to generate predictions"): + row = video_meta_indices[i] + score_row = - sorted_distances[i] + cur_vr_redictions = [] + for j, meta_idx in enumerate(row): + video_idx = video2idx[eval_res["video_meta"][meta_idx]["vid_name"]] + cur_vr_redictions.append([video_idx, 0, 0, float(score_row[j])]) + cur_query_pred = dict( + desc_id=query_meta[i]["desc_id"], + desc=query_meta[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + + vcmr_res = [] + logger.debug("sorted_indices {}".format(sorted_indices.shape)) + logger.debug("sorted_distances {}".format(sorted_distances.shape)) + for idx, (vm_row_indices, p_row_indices) in tqdm(enumerate(zip(video_meta_indices, prop_indices)), + desc="[VCMR] Loop over queries to generate predictions", + total=n_q): # query + sorted_distances_row = - sorted_distances[idx] # converted to negative distance + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [] + for col_idx, (v_col_idx, p_col_idx) in enumerate(zip(vm_row_indices, p_row_indices)): + cur_pred = [] + cur_pred += [video2idx[eval_res["video_meta"][v_col_idx]["vid_name"]], ] + cur_pred += eval_res["video_meta"][v_col_idx]["proposals"][p_col_idx].tolist() + cur_pred += [float(sorted_distances_row[col_idx])] + cur_ranked_predictions.append(cur_pred) + cur_query_pred = dict( + desc_id=eval_res["query_meta"][idx]["desc_id"], + desc=eval_res["query_meta"][idx]["desc"], + predictions=cur_ranked_predictions + ) + vcmr_res.append(cur_query_pred) + return vcmr_res, vr_res + + +def generate_svmr_predictions_from_res(eval_res, max_prop_per_query=None): + """ This function is for Video Corpus Moment Retrieval (VCMR). + Generate prediction file which could be evaluated using standalone_eval.eval. 
+ Args: + eval_res: dict( + query_meta=query_meta_list, # N_q * dict(), each dict is {"desc_id": int, "desc": str} + video_meta=video_meta_list, # N_videos * dict(), {"vid_name": str, "duration": float, "proposals": ndarray} + video2idx=eval_dataset.video2idx, # dict {vid_name: index} + query_prop_dist_svmr=[], # N_q * (N_prop, ) + ) + max_prop_per_query: not used + return: + list(dicts): each dict is dict(desc=str, desc_id=int, predictions=list(sublist)), + each sublist is [vid_name (str), st (float), ed (float), score (float)], score is negative distance. + """ + video2idx = eval_res["video2idx"] + + svmr_res = [] + svmr_video2meta_idx = {e["vid_name"]: idx for idx, e in enumerate(eval_res["video_meta"])} + for idx, (q_p_dist, q_m) in tqdm(enumerate(zip(eval_res["query_prop_dist_svmr"], eval_res["query_meta"])), + desc="Loop over queries to generate predictions", + total=len(eval_res["query_prop_dist_svmr"])): # query + sorted_indices = np.argsort(q_p_dist) # (N_prop, ) # ascending order, distance + if max_prop_per_query is not None: + sorted_indices = sorted_indices[:max_prop_per_query] + v_eval_idx = video2idx[q_m["vid_name"]] + v_meta_idx = svmr_video2meta_idx[q_m["vid_name"]] + proposals = eval_res["video_meta"][v_meta_idx]["proposals"] # (N_p, 2) + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [ + [v_eval_idx, ] + proposals[sort_idx].tolist() + [- round(float(q_p_dist[sort_idx]), 4), ] + for sort_idx in sorted_indices] + cur_query_pred = dict( + desc_id=q_m["desc_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + + +def get_submission_top_n(submission, top_n=100): + def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + top_n_submission = dict(video2idx=submission["video2idx"], ) + for k in submission: + if k != "video2idx": + top_n_submission[k] = get_prediction_top_n(submission[k], top_n) + return top_n_submission + + +def load_external_vr_res(external_vr_res_path, top_n_vr_videos=5): + """return a mapping from desc_id to top retrieved video id""" + external_vr_res = load_json(external_vr_res_path) + external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"] + query2video = {e["desc_id"]: [sub_e[0] for sub_e in e["predictions"]] for e in external_vr_res} + return query2video + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_before_nms=1000, max_after_nms=100): + model.eval() + logger.info("Computing scores") + logger.info("Start timing") + # times = [] # do not use + # for _ in range(3): + # st_time = time.time() + if opt.use_intermediate: + intermediate_cache_path = os.path.join(opt.results_dir, "{}_eval_res.pt".format(opt.eval_split_name)) + if not os.path.exists(intermediate_cache_path): + logger.info("Saving intermediate results {}.".format(intermediate_cache_path)) + eval_res = compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks) + torch.save(eval_res, intermediate_cache_path) + else: + logger.info("Loading intermediate results {}.".format(intermediate_cache_path)) + eval_res = torch.load(intermediate_cache_path) + else: + logger.info("Running without saving intermediate results, you might want to turn on --use_intermediate.") + eval_res 
= compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks) + # del model # We dont need model anymore + + # eval_res = compute_query_proposal_distance(model, eval_dataset, opt, tasks=tasks) + + logger.info("Generating predictions from scores") + eval_submission_raw = dict(video2idx=eval_res["video2idx"]) + if "SVMR" in tasks: + eval_submission_raw["SVMR"] = generate_svmr_predictions_from_res( + eval_res, max_prop_per_query=max_before_nms) + # vcmr_loading_time = 0 + if "VCMR" in tasks: + if opt.external_inference_vr_res_path is not None: + logger.info("Using external VR results from {}".format(opt.external_inference_vr_res_path)) + # vcmr_loading_time = time.time() + eval_res["external_query2video"] = load_external_vr_res( + opt.external_inference_vr_res_path, top_n_vr_videos=5) + # vcmr_loading_time = time.time() - vcmr_loading_time + vcmr_res, vr_res = generate_vcmr_predictions_from_res_with_external( + eval_res, max_prop_per_query=max_before_nms) + else: + vcmr_res, vr_res = generate_vcmr_predictions_from_res( + eval_res, max_prop_per_query=max_before_nms) + eval_submission_raw["VCMR"] = vcmr_res + eval_submission_raw["VR"] = vr_res + # times += [time.time() - st_time - vcmr_loading_time] + # times = torch.FloatTensor(times) + IOU_THDS = (0.5, 0.7) + + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=max_after_nms) + if max_after_nms < 1000: + save_json(eval_submission, submission_path) + else: + torch.save(eval_submission, submission_path.replace(".json", ".pt")) + + metrics = eval_retrieval(eval_submission, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug, + use_desc_type=opt.dset_name == "tvr") + # metrics["time_avg"] = float(times.mean()) + # metrics["time_std"] = float(times.std()) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + if opt.nms_thd != -1: + logger.info("Performing nms with nms_thd {}".format(opt.nms_thd)) + eval_submission_after_nms = dict(video2idx=eval_submission_raw["video2idx"]) + for k, nms_func in POST_PROCESSING_MMS_FUNC.items(): + if k in eval_submission_raw: + eval_submission_after_nms[k] = nms_func(eval_submission_raw[k], + nms_thd=opt.nms_thd, + max_before_nms=max_before_nms, + max_after_nms=max_after_nms) + + logger.info("Saving/Evaluating nms results") + submission_nms_path = submission_path.replace(".json", "_nms_thd_{}.json".format(opt.nms_thd)) + save_json(eval_submission_after_nms, submission_nms_path) + metrics_nms = eval_retrieval(eval_submission_after_nms, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_nms_path = submission_nms_path.replace(".json", "_metrics.json") + save_json(metrics_nms, save_metrics_nms_path, save_pretty=True, sort_keys=False) + latest_file_paths += [submission_nms_path, save_metrics_nms_path] + else: + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = CALWithSub(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + 
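+    # Move the model to the configured device; wrap it in DataParallel when more than one GPU id is given.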
+ if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = ProposalRetrievalEvalDataset( + dset_name=opt.dset_name, + model_type=opt.model_type, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + eval_proposal_bsz=opt.eval_proposal_bsz, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + model = setup_model(opt) + save_submission_filename = \ + "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, tasks=opt.tasks, + max_before_nms=opt.max_before_nms, max_after_nms=opt.max_after_nms) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/clip_alignment_with_language/local_utils/__init__.py b/baselines/clip_alignment_with_language/local_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc b/baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dca00e79ec8256ef8f67c2db3d6d805276c2503b Binary files /dev/null and b/baselines/clip_alignment_with_language/local_utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc b/baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff81922c6ee768f2dab46dfcd67c9edc0b1b356b Binary files /dev/null and b/baselines/clip_alignment_with_language/local_utils/__pycache__/compute_proposal_upper_bound.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc b/baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3963685d54d62927885a4c1427d81368674005b Binary files /dev/null and b/baselines/clip_alignment_with_language/local_utils/__pycache__/proposal.cpython-311.pyc differ diff --git a/baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py b/baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py new 
file mode 100644 index 0000000000000000000000000000000000000000..d4f4f748c41ebf3a76aaf302b6f94caf3e2098bf --- /dev/null +++ b/baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py @@ -0,0 +1,117 @@ +""" +Compute oracle upper bound for a given proposal method, which acts like +a reversed recall, where we recall the GT timestamp pairs in the set of +generated proposals. +""" +import pprint +import numpy as np +from tqdm import tqdm +from collections import Counter +from utils.basic_utils import load_jsonl, save_json +from standalone_eval.eval import compute_temporal_iou_batch +from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface, ProposalConfigs + + +def get_didemo_agreed_ts(times_list): + """ + input example: [[1, 1], [1, 1], [1, 1], [0, 0]], + return: [1, 1]""" + times_str_list = [tuple(e) for e in times_list] + times_str_list_counter = Counter(times_str_list) + most_frequent_times = times_str_list_counter.most_common(1)[0][0] + return most_frequent_times + + +def get_proposals_for_single_desc_video_pair(single_data, proposal_fn, dset_name): + proposal_info = dict( + vid_name=single_data["vid_name"], + desc_id=single_data["desc_id"], + gt_ts=single_data["ts"] if dset_name != "didemo" else get_didemo_agreed_ts(single_data["ts"]), + proposals=proposal_fn(video_id="", metadata={"duration": single_data["duration"]}), + ) + proposal_info["proposal_ious"] = compute_temporal_iou_batch( + proposal_info["proposals"], proposal_info["gt_ts"]) + return proposal_info + + +def get_proposals_for_videos(datalist, dset_name): + """datalist list(dict): each dict is + {"desc_id": str/int, "duration": float, "ts": [st (float), ed (float)], ...} + Note for Didemo dataset, "ts" entry is a list of [st (float), ed (float)] from different annotators, + here we use the most frequent ts, we break ties by randomly sample one + """ + proposal_interface = get_proposal_interface(dset_name) + video_proposals_list = [] + for e in tqdm(datalist, desc="Computing video proposals"): + video_proposals_list.append( + get_proposals_for_single_desc_video_pair(e, proposal_interface, dset_name)) + return video_proposals_list + + +def is_recalled_single_moment(proposal_ious, iou_thds=(0.5, 0.7)): + """ + Args: + proposal_ious: np.ndarray, shape (N_proposal, ) + iou_thds: set, temporal IoU thresholds + + Returns: + list(bool), len == len(iou_thds), indicates whether recall under a iou_thd is found. 
+ """ + recalled = [False, ] * len(iou_thds) + for idx, iou_thd in enumerate(iou_thds): + recalled[idx] = np.sum(proposal_ious >= iou_thd) >= 1 # at least one + return recalled + + +def compute_proposal_recall_upper_bound(video_proposals_list, iou_thds=(0.5, 0.7)): + """video_proposals_list from get_proposals_for_videos()""" + iou_corrects = np.empty((len(video_proposals_list), 2), dtype=np.float32) + for idx, d in tqdm(enumerate(video_proposals_list), + desc="Computing recall for videos", + total=len(video_proposals_list)): + iou_corrects[idx] = is_recalled_single_moment(d["proposal_ious"], + iou_thds=iou_thds) + recall_by_iou = {iou_thd: float(np.mean(iou_corrects[:, idx])) + for idx, iou_thd in enumerate(iou_thds)} + return recall_by_iou + + +def main_compute_upper_bound(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-dset_name", type=str, choices=["tvr"]) + parser.add_argument("-eval_file_path", type=str, help="path to the file containing data to be evaluated") + parser.add_argument("-save_path", type=str, help="path to save the results") + parser.add_argument("-verbose", action="store_true") + args = parser.parse_args() + + eval_datalist = load_jsonl(args.eval_file_path) + video_proposals_list = get_proposals_for_videos(eval_datalist, args.dset_name) + recall_metrics = compute_proposal_recall_upper_bound(video_proposals_list, iou_thds=(0.5, 0.7)) + + video_proposals_list_by_video = {} + for p in video_proposals_list: + if p["vid_name"] in video_proposals_list_by_video: + continue + else: + video_proposals_list_by_video[p["vid_name"]] = p + video_proposals_list_by_video = list(video_proposals_list_by_video.values()) + total_n_clips_in_proposals = \ + np.sum([np.sum(e["proposals"][:, 1] - e["proposals"][:, 0]) for e in video_proposals_list_by_video]) + + results = dict( + avg_num_proposals=float(np.mean([len(e["proposals"]) for e in video_proposals_list_by_video])), + total_num_proposals=int(np.sum([len(e["proposals"]) for e in video_proposals_list_by_video])), + recall_metrics=recall_metrics, + dset_name=args.dset_name, + filename=args.eval_file_path, + proposal_config=ProposalConfigs[args.dset_name] + ) + results["avg_clip_per_proposal"] = total_n_clips_in_proposals / results["total_num_proposals"] + save_json(results, args.save_path, save_pretty=True) + if args.verbose: + pprint.pprint(results) + + +if __name__ == '__main__': + main_compute_upper_bound() diff --git a/baselines/clip_alignment_with_language/local_utils/proposal.py b/baselines/clip_alignment_with_language/local_utils/proposal.py new file mode 100644 index 0000000000000000000000000000000000000000..d81d32074f3e2f337b6ac52bb99ceb7ae869d09a --- /dev/null +++ b/baselines/clip_alignment_with_language/local_utils/proposal.py @@ -0,0 +1,181 @@ +# MIT License +# +# Copyright (c) 2018 Victor Escorcia Castillo +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================== +""" +Group multiple methods to generate salient temporal windows in a video""" +import itertools +import numpy as np + +PROPOSAL_SCHEMES = ['DidemoICCV17SS', 'SlidingWindowMSRSS'] + + +class TemporalProposalsBase: + """Base class (signature) to generate temporal candidates in a video""" + def __call__(self, video_id, metadata=None, feature_collection=None): + raise NotImplementedError('Implement with the signature above') + + +class DidemoICCV17SS(TemporalProposalsBase): + """Original search space of moments proposed in ICCV-2017 + + Attributes: + clip_length_min (float) : minimum length, in seconds, of a video clip. + proposals (numpy array) : of shape [21, 2] representing all the + possible temporal segments of valid annotations of DiDeMo dataset. + It represents the search space of a temporal localization + algorithm. + + Reference: Hendricks et al. Localizing Moments in Video with Natural + Language. ICCV 2017. + """ + clip_length_min = 5.0 + + def __init__(self, *args, dtype=np.float32, **kwargs): + clips_indices = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] + for i in itertools.combinations(range(len(clips_indices)), 2): + clips_indices.append(i) + self.proposals = np.array(clips_indices, dtype=dtype) + self.proposals *= self.clip_length_min + self.proposals[:, 1] += self.clip_length_min + + def __call__(self, *args, **kwargs): + return self.proposals + + +class SlidingWindowMSRSS(TemporalProposalsBase): + """Multi-scale sliding window with relative stride within the same scale + + Attributes: + length (float) : length of smallest window. + scales (sequence of int) : duration of moments relative to + `length`. + stride (float) : relative stride between two windows with the same + duration. We used different strides for each scale, rounding them + towards a multiple of `length`. Note that the minimum stride for + any window will be the `length` itself. 
+ dtype (numpy.dtype) : + """ + + def __init__(self, length, scales, stride=0.5, round_base=0.5, dtype=np.float32): + self.length = length + self.scales = scales + self.round_base = round_base + self.relative_stride = stride + # pick strides per scale that are multiples of length + self.strides = [max(round(s * stride / round_base) * round_base, round_base) + * length for s in scales] + self.dtype = dtype + assert len(scales) > 0 + + def sliding_windows(self, t_end, t_start=0): + """sliding canonical windows over a given time interval""" + windows_ = [] + for i, stride in enumerate(self.strides): + num_i = np.ceil((t_end - t_start) / stride) + windows_i = np.empty((int(num_i), 2), dtype=np.float32) + windows_i[:, 0] = np.arange(t_start, t_end, stride) + windows_i[:, 1] = windows_i[:, 0] + self.length * self.scales[i] + windows_i[windows_i[:, 1] > t_end, 1] = t_end + windows_.append(windows_i) + # print("--------------------------------{}".format(i)) + # print(windows_i) + # import sys + # sys.exit(1) + windows = np.concatenate(windows_, axis=0) + # Hacky way to make windows fit inside video + # It implies windows at the end may not belong to the set spanned by + # length and scales. + return np.unique(windows, axis=0) + + def __call__(self, video_id, metadata=None, feature_collection=None): + """return: (N_window, 2), each row contains (start, end)""" + duration = metadata.get('duration') + assert duration is not None + return self.sliding_windows(duration) + + +ProposalConfigs = { + "didemo": { + "proposal_interface": "DidemoICCV17SS", + "clip_length": 2.5, + }, + "tvr": { + "length": 3, # min proposal length + "scales": [1, 2, 4, 8], + "stride": 0.3, + "round_base": 1, + "min_proposal_length": 3, # length * min(scales) + "clip_length": 1.5, # length should be divisible by clip_length + "proposal_interface": "SlidingWindowMSRSS", + }, + "anet_cap": { + "length": 5, + "scales": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26], + "stride": 0.3, + "round_base": 1, + "min_proposal_length": 10, # length * min(scales) + "clip_length": 5, # length * min(scales) / 2 + "proposal_interface": "SlidingWindowMSRSS", + }, + "charades_sta": { + "length": 3, + "scales": [2, 3, 4, 5, 6, 7, 8], + "stride": 0.3, + "round_base": 1, + "min_proposal_length": 6, # length * min(scales) + "clip_length": 3, # length * min(scales) / 2 + "proposal_interface": "SlidingWindowMSRSS", + }, + "profiling": { + "length": 5, + "scales": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + "stride": 0.3, + "round_base": 1, + "clip_length": 5, # length * min(scales) / 2 + "proposal_interface": "SlidingWindowMSRSS", + }, +} +""" +'min_clip_length' is used to uniformly segment the video into smaller clips, it is a half of +the 'min_proposal_length'. Thus we can enforce each moment has at least 2 clips. 
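+For example, the 'tvr' config above uses clip_length=1.5, half of its min_proposal_length=3, so the shortest proposal spans exactly 2 clips.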
+""" + + +def get_proposal_interface(dset_name): + """ dset_name (str): one of ["tvr"] """ + assert dset_name in ProposalConfigs + if dset_name == "didemo": + return DidemoICCV17SS() + else: + arg_names = ["length", "scales", "stride", "round_base"] + func_args = {k: ProposalConfigs[dset_name][k] for k in arg_names} + return SlidingWindowMSRSS(**func_args) + + +if __name__ == '__main__': + test_fns_args = [(DidemoICCV17SS, (),), + (SlidingWindowMSRSS, (1.5, [2, 4, 6, 12]))] + for fn_i, args_i in test_fns_args: + proposal_fn = fn_i(*args_i) + x = proposal_fn('hola', {'duration': 15}) + if fn_i == DidemoICCV17SS: + assert len(x) == 21 diff --git a/baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt b/baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..780ec601a2d1e3ed911bcdc5d92ddcff6be8592d --- /dev/null +++ b/baselines/clip_alignment_with_language/local_utils/tvr_proposal_test_log.txt @@ -0,0 +1,61 @@ + +""" +{'avg_num_proposals': 158.30197338228544, + 'dset_name': 'tvr', + 'filename': 'data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 1, + 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.8927030563354492, 0.7: 0.6690225005149841}, + 'total_num_proposals': 344940} + + +{'avg_num_proposals': 213.3295089490592, + 'dset_name': 'tvr', + 'filename': 'data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'min_clip_length': 1.5, + 'min_proposal_length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9612666368484497, 0.7: 0.8215695023536682}, + 'total_num_proposals': 464845} + -- + + +{'avg_num_proposals': 213.3295089490592, + 'dset_name': 'tvr', + 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9612666368484497, 0.7: 0.8215695023536682}} + + +{'avg_num_proposals': 263.3845800826067, + 'dset_name': 'tvr', + 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9841211438179016, 0.7: 0.8567232489585876}} + + +{'avg_num_proposals': 242.97246443322626, + 'dset_name': 'tvr', + 'filename': '../../data/retrieval_release_data_with_ids/tvr_val_release.jsonl', + 'proposal_config': {'length': 3, + 'proposal_interface': 'SlidingWindowMSRSS', + 'round_base': 0.5, + 'scales': [0.5, 1, 2, 3, 4, 5, 6, 7, 8], + 'stride': 0.3}, + 'recall_metrics': {0.5: 0.9608076810836792, 0.7: 0.8212941884994507}} +""" \ No newline at end of file diff --git a/baselines/clip_alignment_with_language/mix_model_prediction.py b/baselines/clip_alignment_with_language/mix_model_prediction.py new file mode 100644 index 0000000000000000000000000000000000000000..9b74d37b62c954119e33eac58d79951c32b15165 --- /dev/null +++ b/baselines/clip_alignment_with_language/mix_model_prediction.py @@ -0,0 +1,86 @@ +""" +Implement the CAL + CAL (TEF) model 
mentioned in +``` +@article{Escorcia2019TemporalLO, + title={Temporal Localization of Moments in Video Collections with Natural Language}, + author={Victor Escorcia and Mattia Soldan and Josef Sivic and Bernard Ghanem and Bryan Russell}, + journal={ArXiv}, + year={2019}, + volume={abs/1907.12763} +} +``` + +Methods: + 1, Give top200 predictions for each query in CAL then using CAL (TEF) to re-rank. + 2, This is approximated by re-ranking the top200 CAL using top1000 CAL(TEF) -- we assume they will be all covered. +""" + +import torch +import subprocess +import numpy as np +from tqdm import tqdm +from utils.basic_utils import load_json, save_json + + +def load_saved_res(pred_path): + if pred_path.endswith(".json"): + pred = load_json(pred_path) + else: + pred = torch.load(pred_path) + vcmr_res = {e["desc_id"]: e for e in pred["VCMR"]} + video2idx = pred["video2idx"] + return vcmr_res, video2idx + + +def main_mix_results(pred_path, tef_pred_path, save_path, max_after_nms=100): + """ + Args: + pred_path: contains top-200 VCMR predictions + tef_pred_path: contains top-1000 VCMR predictions + save_path: + max_after_nms: int, + Returns: + save + """ + vcmr_res, video2idx = load_saved_res(pred_path) + tef_vcmr_res, video2idx = load_saved_res(tef_pred_path) + + reranked_vcmr_res = {} + num_valid = [] + for desc_id, preds in tqdm(vcmr_res.items(), desc="Loop over the predictions"): + tef_preds = tef_vcmr_res[desc_id]["predictions"] + pred_moments = set([tuple(e[:3]) for e in preds["predictions"]]) + reranked_moments = [e for e in tef_preds if tuple(e[:3]) in pred_moments][:max_after_nms] + num_valid += [len(reranked_moments)] + if len(reranked_moments) != 100: + reranked_moments += reranked_moments[:100 - len(reranked_moments)] + reranked_vcmr_res[desc_id] = dict( + predictions=reranked_moments, + desc_id=desc_id, + desc=preds["desc"] + ) + + print("There are {} moments founded on average".format(np.mean(num_valid))) + reranked_predictions = dict( + VCMR=list(reranked_vcmr_res.values()), + video2idx=video2idx + ) + + save_json(reranked_predictions, save_path) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--pred_path", type=str, help="path to prediction res") + parser.add_argument("--tef_pred_path", type=str, help="path to TEF prediction res") + parser.add_argument("--save_path", type=str, help="path to save the re-ranked predictions, same dir as --pred_path") + parser.add_argument("--gt_path", type=str, help="path to ground truth file") + args = parser.parse_args() + + main_mix_results(args.pred_path, args.tef_pred_path, args.save_path) + + metrics_path = args.save_path.replace(".json", "_metrics.json") + eval_cmd = "python standalone_eval/eval.py --submission_path " + args.save_path + " --gt_path " + args.gt_path + \ + " --save_path " + metrics_path + results = subprocess.run(eval_cmd, shell=True) diff --git a/baselines/clip_alignment_with_language/model.py b/baselines/clip_alignment_with_language/model.py new file mode 100644 index 0000000000000000000000000000000000000000..06e7ec39f34a60ec8fa7be070fdd24659e536a6c --- /dev/null +++ b/baselines/clip_alignment_with_language/model.py @@ -0,0 +1,299 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from utils.model_utils import RNNEncoder +from easydict import EasyDict as edict + + +cal_base_cfg = edict( + visual_input_size=2048, # changes based on visual input type + textual_input_size=768, + query_feat_size=768, + visual_hidden_size=500, # + output_size=100, + 
embedding_size=768, + lstm_hidden_size=1000, + margin=0.1, # margin for ranking loss + loss_type="hinge", # loss type, 'hinge' or 'lse' + inter_loss_weight=0.4, # weight for inter negatives + ctx_mode="video" +) + + +class CAL(nn.Module): + def __init__(self, config): + super(CAL, self).__init__() + self.config = config + + self.moment_mlp = nn.Sequential( + nn.Linear(config.visual_input_size, config.visual_hidden_size), + nn.ReLU(True), + nn.Linear(config.visual_hidden_size, config.output_size), + ) + + self.query_lstm = RNNEncoder(word_embedding_size=config.embedding_size, + hidden_size=config.lstm_hidden_size, + bidirectional=False, + rnn_type="lstm", + dropout_p=0, + n_layers=1, + return_outputs=False) + + self.query_linear = nn.Linear(config.lstm_hidden_size, config.output_size) + + def moment_encoder(self, moment_feat): + """moment_feat: (N, L_clip, D_v)""" + return F.normalize(self.moment_mlp(moment_feat), p=2, dim=-1) # (N, L_clip, D_o) + + def query_encoder(self, query_feat, query_mask): + """ + Args: + query_feat: (N, L_q, D_q), torch.float32 + query_mask: (N, L_q), torch.float32, with 1 indicates valid query, 0 indicates mask + """ + _, hidden = self.query_lstm(query_feat, torch.sum(query_mask, dim=1).long()) + return F.normalize(self.query_linear(hidden), p=2, dim=-1) # (N, D_o) + + def compute_pdist(self, query_embedding, moment_feat, moment_mask): + """ pairwise L2 distance + Args: + query_embedding: (N, D_o) + moment_feat: (N, L_clip, D_v) + moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding + """ + moment_embedding = self.moment_encoder(moment_feat) # (N, L_clip, D_o) + moment_clip_dist = torch.sum((moment_embedding - query_embedding.unsqueeze(1)) ** 2, dim=2) # (N, L_clip) + moment_dist = torch.sum(moment_clip_dist * moment_mask, dim=1) / moment_mask.sum(1) # (N, ) + return moment_dist # (N, ) + + @classmethod + def compute_cdist_inference(cls, query_embeddings, moment_embeddings, moment_mask): + """ Compute L2 distance for every possible pair of queries and proposals. This is different from + compute_pdist as the latter computes only pairs at each row. 
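+        Per-clip squared distances are averaged over the valid clips of each
+        proposal (via moment_mask), so entry (i, j) equals the value compute_pdist
+        would give for query i and proposal j when fed the same embeddings.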
+ Args: + query_embeddings: (N_q, D_o) + moment_embeddings: (N_prop, N_clips, D_o) + moment_mask: (N_prop, N_clips) + return: + query_moment_scores: (N_q, N_prop) + """ + # sync device + query_device = query_embeddings.device # convert to cuda if we want to use GPU + if moment_embeddings.device != query_device: + moment_embeddings = moment_embeddings.to(query_device) + moment_mask = moment_mask.to(query_device) + + # compute + n_query = query_embeddings.shape[0] + n_prop, n_clips, d = moment_embeddings.shape + query_clip_dist = torch.cdist( + query_embeddings, moment_embeddings.reshape(-1, d), p=2) ** 2 # (N_q, N_prop * N_clips) + query_clip_dist = query_clip_dist.reshape(n_query, n_prop, n_clips) + query_moment_dist = torch.sum( + query_clip_dist * moment_mask.unsqueeze(0), dim=2) / moment_mask.sum(1).unsqueeze(0) + return query_moment_dist # (N_q, N_prop) + + def forward(self, query_feat, query_mask, pos_moment_feat, pos_moment_mask, + intra_neg_moment_feat, intra_neg_moment_mask, + inter_neg_moment_feat, inter_neg_moment_mask): + """ + Args: + query_feat: (N, L, D_q) + query_mask: (N, L) + pos_moment_feat: (N, L_clip_1, D_v) + pos_moment_mask: (N, L_clip_1) + intra_neg_moment_feat: (N, L_clip_2, D_v) + intra_neg_moment_mask: (N, L_clip_2) + inter_neg_moment_feat: (N, L_clip_3, D_v) + inter_neg_moment_mask: (N, L_clip_2) + """ + query_embed = self.query_encoder(query_feat, query_mask) # (N, D_o) + pos_dist = self.compute_pdist(query_embed, pos_moment_feat, pos_moment_mask) # (N, ) + intra_neg_dist = self.compute_pdist(query_embed, intra_neg_moment_feat, intra_neg_moment_mask) # (N, ) + if self.config.inter_loss_weight == 0: # should be zero for tef_only method. + loss_inter = 0. + else: + inter_neg_dist = self.compute_pdist(query_embed, inter_neg_moment_feat, inter_neg_moment_mask) # (N, ) + loss_inter = self.calc_loss(pos_dist, inter_neg_dist) + + loss = self.calc_loss(pos_dist, intra_neg_dist) + self.config.inter_loss_weight * loss_inter + return loss + + def calc_loss(self, pos_dist, neg_dist): + """ Note here we encourage positive distance to be smaller than negative distance. 
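+        For example, with margin=0.1, pos_dist=0.5 and neg_dist=0.4 for a single pair,
+        the 'hinge' term is max(0, 0.1 + 0.5 - 0.4) = 0.2 and the 'lse' term is
+        log(1 + exp(0.5 - 0.4)) ~= 0.744; both are then averaged over the batch.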
+ Args: + pos_dist: (N, ), torch.float32 + neg_dist: (N, ), torch.float32 + """ + if self.config.loss_type == "hinge": # max(0, m + S_pos - S_neg) + return torch.clamp(self.config.margin + pos_dist - neg_dist, min=0).sum() / len(pos_dist) + elif self.config.loss_type == "lse": # log[1 + exp(S_pos - S_neg)] + return torch.log1p(torch.exp(pos_dist - neg_dist)).sum() / len(pos_dist) + else: + raise NotImplementedError("Only support 'hinge' and 'lse'") + + +class CALWithSub(nn.Module): + def __init__(self, config): + super(CALWithSub, self).__init__() + self.config = config + self.use_video = "video" in config.ctx_mode + self.use_sub = "sub" in config.ctx_mode + self.use_tef = "tef" in config.ctx_mode + self.tef_only = self.use_tef and not self.use_video and not self.use_sub + + if self.use_video or self.tef_only: + self.video_moment_mlp = nn.Sequential( + nn.Linear(config.visual_input_size, config.visual_hidden_size), + nn.ReLU(True), + nn.Linear(config.visual_hidden_size, config.output_size), + ) + + if self.use_sub: + self.sub_moment_mlp = nn.Sequential( + nn.Linear(config.textual_input_size, config.visual_hidden_size), + nn.ReLU(True), + nn.Linear(config.visual_hidden_size, config.output_size), + ) + + self.query_lstm = RNNEncoder(word_embedding_size=config.query_feat_size, + hidden_size=config.lstm_hidden_size, + bidirectional=False, + rnn_type="lstm", + dropout_p=0, + n_layers=1, + return_outputs=False) + + self.query_linear = nn.Linear(config.lstm_hidden_size, config.output_size) + + def moment_encoder(self, moment_feat, module_name="video"): + """moment_feat: (N, L_clip, D_v)""" + if moment_feat is not None: + encoder = getattr(self, module_name + "_moment_mlp") + return F.normalize(encoder(moment_feat), p=2, dim=-1) # (N, L_clip, D_o) + else: + return None + + def query_encoder(self, query_feat, query_mask): + """ + Args: + query_feat: (N, L_q, D_q), torch.float32 + query_mask: (N, L_q), torch.float32, with 1 indicates valid query, 0 indicates mask + """ + _, hidden = self.query_lstm(query_feat, torch.sum(query_mask, dim=1).long()) + return F.normalize(self.query_linear(hidden), p=2, dim=-1) # (N, D_o) + + def _compute_pdist(self, query_embedding, moment_feat, moment_mask, module_name="video"): + """ pairwise L2 distance + Args: + query_embedding: (N, D_o) + moment_feat: (N, L_clip, D_v) + moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding + """ + moment_embedding = self.moment_encoder(moment_feat, module_name=module_name) # (N, L_clip, D_o) + moment_clip_dist = torch.sum((moment_embedding - query_embedding.unsqueeze(1)) ** 2, dim=2) # (N, L_clip) + moment_dist = torch.sum(moment_clip_dist * moment_mask, dim=1) / moment_mask.sum(1) # (N, ) + return moment_dist # (N, ) + + def compute_pdist(self, query_embedding, moment_video_feat, moment_sub_feat, moment_mask): + """ pairwise L2 distance + Args: + query_embedding: (N, D_o) + moment_video_feat: (N, L_clip, D_v) + moment_sub_feat: (N, L_clip, D_t) + moment_mask: (N, L_clip), torch.float32, where 1 indicates valid, 0 indicates padding + """ + divisor = (self.use_video or self.tef_only) + self.use_sub + video_moment_dist = self._compute_pdist(query_embedding, moment_video_feat, moment_mask, module_name="video") \ + if self.use_video or self.tef_only else 0 + sub_moment_dist = self._compute_pdist(query_embedding, moment_sub_feat, moment_mask, module_name="sub") \ + if self.use_sub else 0 + return (video_moment_dist + sub_moment_dist) / divisor # (N, ) + + def _compute_cdist_inference(self, 
query_embeddings, moment_embeddings, moment_mask): + """ Compute L2 distance for every possible pair of queries and proposals. This is different from + compute_pdist as the latter computes only pairs at each row. + Args: + query_embeddings: (N_q, D_o) + moment_embeddings: (N_prop, N_clips, D_o) + moment_mask: (N_prop, N_clips) + return: + query_moment_scores: (N_q, N_prop) + """ + # sync device + query_device = query_embeddings.device # convert to cuda if we want to use GPU + if moment_embeddings.device != query_device: + moment_embeddings = moment_embeddings.to(query_device) + moment_mask = moment_mask.to(query_device) + + # compute + n_query = query_embeddings.shape[0] + n_prop, n_clips, d = moment_embeddings.shape + query_clip_dist = torch.cdist( + query_embeddings, moment_embeddings.reshape(-1, d), p=2) ** 2 # (N_q, N_prop * N_clips) + query_clip_dist = query_clip_dist.reshape(n_query, n_prop, n_clips) + query_moment_dist = torch.sum( + query_clip_dist * moment_mask.unsqueeze(0), dim=2) / moment_mask.sum(1).unsqueeze(0) + return query_moment_dist # (N_q, N_prop) + + def compute_cdist_inference(self, query_embeddings, video_moment_embeddings, sub_moment_embeddings, moment_mask): + divisor = (self.use_video or self.tef_only) + self.use_sub + video_moment_dist = self._compute_cdist_inference(query_embeddings, video_moment_embeddings, moment_mask) \ + if self.use_video or self.tef_only else 0 + sub_moment_dist = self._compute_cdist_inference(query_embeddings, sub_moment_embeddings, moment_mask) \ + if self.use_sub else 0 + return (video_moment_dist + sub_moment_dist) / divisor # (N_q, N_prop) + + def forward(self, query_feat, query_mask, pos_moment_video_feat, pos_moment_video_mask, + intra_neg_moment_video_feat, intra_neg_moment_video_mask, + inter_neg_moment_video_feat, inter_neg_moment_video_mask, + pos_moment_sub_feat, pos_moment_sub_mask, + intra_neg_moment_sub_feat, intra_neg_moment_sub_mask, + inter_neg_moment_sub_feat, inter_neg_moment_sub_mask): + """ + Args: + query_feat: (N, L, D_q) + query_mask: (N, L) + pos_moment_video_feat: (N, L_clip_1, D_v) + pos_moment_video_mask: (N, L_clip_1) + intra_neg_moment_video_feat: (N, L_clip_2, D_v) + intra_neg_moment_video_mask: (N, L_clip_2) + inter_neg_moment_video_feat: (N, L_clip_3, D_v) + inter_neg_moment_video_mask: (N, L_clip_2) + pos_moment_sub_feat: + pos_moment_sub_mask: + intra_neg_moment_sub_feat: + intra_neg_moment_sub_mask: + inter_neg_moment_sub_feat: + inter_neg_moment_sub_mask: + """ + query_embed = self.query_encoder(query_feat, query_mask) # (N, D_o) + pos_dist = self.compute_pdist( + query_embed, pos_moment_video_feat, pos_moment_sub_feat, + moment_mask=pos_moment_sub_mask if self.use_sub else pos_moment_video_mask) # (N, ) + intra_neg_dist = self.compute_pdist( + query_embed, intra_neg_moment_video_feat, intra_neg_moment_sub_feat, + moment_mask=intra_neg_moment_sub_mask if self.use_sub else intra_neg_moment_video_mask) # (N, ) + if self.config.inter_loss_weight == 0: # should be zero for tef_only method. + loss_inter = 0. 
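+        # Intra-video negatives are different moments from the positive video, while
+        # inter-video negatives come from another video (see sample_intra_neg_moment /
+        # sample_inter_video_negative in the dataset); the inter term is weighted by
+        # inter_loss_weight and skipped when that weight is 0 (e.g. TEF-only models).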
+ else: + inter_neg_dist = self.compute_pdist( + query_embed, inter_neg_moment_video_feat, inter_neg_moment_sub_feat, + moment_mask=inter_neg_moment_sub_mask if self.use_sub else inter_neg_moment_video_mask) # (N, ) + loss_inter = self.calc_loss(pos_dist, inter_neg_dist) + + loss = self.calc_loss(pos_dist, intra_neg_dist) + self.config.inter_loss_weight * loss_inter + return loss + + def calc_loss(self, pos_dist, neg_dist): + """ Note here we encourage positive distance to be smaller than negative distance. + Args: + pos_dist: (N, ), torch.float32 + neg_dist: (N, ), torch.float32 + """ + if self.config.loss_type == "hinge": # max(0, m + S_pos - S_neg) + return torch.clamp(self.config.margin + pos_dist - neg_dist, min=0).sum() / len(pos_dist) + elif self.config.loss_type == "lse": # log[1 + exp(S_pos - S_neg)] + return torch.log1p(torch.exp(pos_dist - neg_dist)).sum() / len(pos_dist) + else: + raise NotImplementedError("Only support 'hinge' and 'lse'") diff --git a/baselines/clip_alignment_with_language/proposal_retrieval_dataset.py b/baselines/clip_alignment_with_language/proposal_retrieval_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..dcbd86c61b565d6654db854e33fa9ef4b623a906 --- /dev/null +++ b/baselines/clip_alignment_with_language/proposal_retrieval_dataset.py @@ -0,0 +1,587 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +import math +import random +from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array +from utils.tensor_utils import pad_sequences_1d +from baselines.clip_alignment_with_language.local_utils.proposal import get_proposal_interface +from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \ + get_didemo_agreed_ts +from standalone_eval.eval import compute_temporal_iou_batch + +logger = logging.getLogger(__name__) + + +class ProposalRetrievalDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + pos_iou_thd: float, in [0, 1], >= pos_iou_thd are defined as positive + neg_iou_thd: float, in [0, 1], < neg_iou_thd are defined as negative + Return: + a dict: { + "meta": { + "desc_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + "pos_moment": [st (float), ed (float)], seconds, IoU with "ts" >= pos_iou_thd + "intra_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd + "inter_neg_vid_name": str, + "inter_neg_duration": float, + "inter_neg_moment": [st (float), ed (float)], seconds, IoU with "ts" < neg_iou_thd + } + "model_inputs": { + "desc_feat": torch.tensor, (L, D_t) + "pos_moment_feat": torch.tensor, (n_clip_in_moment, D) + "intra_neg_moment_feat": torch.tensor, (n_clip_in_moment, D) + "inter_neg_moment_feat": torch.tensor, (n_clip_in_moment, D) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path, sub_bert_path, max_desc_len, + vid_feat_path, clip_length, vid_feat_size, sub_feat_size=0, ctx_mode="video_tef", + pos_iou_thd=0.7, neg_iou_thd=0.3, h5driver=None, data_ratio=1.0, + normalize_vfeat=True, normalize_tfeat=True, model_type="cal", + external_train_vr_res_path=None, corpus_path=None): + self.dset_name = dset_name + self.model_type = model_type + self.pool_local = model_type == "mcn" # pool local feature + self.data_path = data_path + self.data_ratio = data_ratio + + self.desc_bert_path = desc_bert_path + self.max_desc_len = max_desc_len + 
self.sub_bert_path = sub_bert_path + + self.vid_feat_path = vid_feat_path + self.clip_length = clip_length + self.ctx_mode = ctx_mode + + self.pos_iou_thd = pos_iou_thd + self.neg_iou_thd = neg_iou_thd + + self.vid_feat_output_size = 2 * vid_feat_size * ("video" in ctx_mode) + 2 * ("tef" in ctx_mode) + self.sub_feat_output_size = 2 * sub_feat_size * ("sub" in ctx_mode) + 2 * ("tef" in ctx_mode) + + # prepare desc data + self.data = load_jsonl(data_path) + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.proposal_fn = get_proposal_interface(dset_name) + if self.ctx_mode != "tef": + self.vid_feat_h5 = h5py.File(self.vid_feat_path, "r", driver=h5driver) + self.desc_bert_h5 = h5py.File(self.desc_bert_path, "r", driver=h5driver) + if "sub" in self.ctx_mode: + self.sub_bert_h5 = h5py.File(self.sub_bert_path, "r", driver=h5driver) + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if external_train_vr_res_path is not None: + video_data = load_json(corpus_path)["train"] + # {video_idx: [vid_name, vid_duration]} + video_idx2name_dur_pair = {v[1]: [k, v[0]] for k, v in video_data.items()} + external_vr_res = load_json(external_train_vr_res_path) + # {desc_id: [(vid_name, vid_duration), ...]} + self.desc_id2video_names_dur_pairs = \ + {e["desc_id"]: [video_idx2name_dur_pair[int(sub_e[0])] for sub_e in e["predictions"]] + for e in external_vr_res["VR"]} # ordered + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + desc_id=raw_data["desc_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ts=raw_data["ts"] if self.dset_name != "didemo" else get_didemo_agreed_ts(raw_data["ts"]), + ) + model_inputs = dict() + query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + model_inputs["query_feat"] = torch.from_numpy(query_feat) + + # sample positive and negative moments + meta["pos_moment"] = self.align_ts_to_clip_boundaries(meta["duration"], meta["ts"]) + meta["intra_neg_moment"] = self.sample_intra_neg_moment(meta["duration"], meta["ts"]) + meta["inter_neg_moment"], meta["inter_neg_vid_name"], meta["inter_neg_duration"] = \ + self.sample_inter_video_negative(meta["vid_name"], meta["pos_moment"] / meta["duration"], + desc_id=meta["desc_id"]) + + pos_tef, intra_neg_tef, inter_neg_tef = (None,) * 3 + if self.use_tef: + pos_tef = meta["pos_moment"] / meta["duration"] # temporal endpoint feature, (2, ) + intra_neg_tef = meta["intra_neg_moment"] / meta["duration"] + inter_neg_tef = meta["inter_neg_moment"] / meta["inter_neg_duration"] + + if self.use_video: + pos_v_feat = self.vid_feat_h5[meta["vid_name"]] # (N_frm, D) + neg_v_feat = self.vid_feat_h5[meta["inter_neg_vid_name"]] + pos_v_ctx_feat = np.mean(pos_v_feat, axis=0) + neg_v_ctx_feat = np.mean(neg_v_feat, axis=0) + if self.normalize_vfeat: + pos_v_ctx_feat = l2_normalize_np_array(pos_v_ctx_feat) + neg_v_ctx_feat = l2_normalize_np_array(neg_v_ctx_feat) + pos_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["pos_moment"], + normalize=self.normalize_vfeat, + fix_outbound=True, 
pool_local=self.pool_local) + intra_neg_moment_v_feat = self.get_moment_feat(pos_v_feat, meta["intra_neg_moment"], + normalize=self.normalize_vfeat, + fix_outbound=True, pool_local=self.pool_local) + inter_neg_moment_v_feat = self.get_moment_feat(neg_v_feat, meta["inter_neg_moment"], + normalize=self.normalize_vfeat, + fix_outbound=True, pool_local=self.pool_local) + + # concat features, [video_clip_feat; video_context_feat; temporal_endpoint_feat] + model_inputs["pos_moment_video_feat"] = self.concat_feat_adv( + moment_feats=[pos_moment_v_feat, pos_v_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode) + model_inputs["intra_neg_moment_video_feat"] = self.concat_feat_adv( + moment_feats=[intra_neg_moment_v_feat, pos_v_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode) + model_inputs["inter_neg_moment_video_feat"] = self.concat_feat_adv( + moment_feats=[inter_neg_moment_v_feat, neg_v_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode) + else: + for k in ["pos_moment_video_feat", "intra_neg_moment_video_feat", "inter_neg_moment_video_feat"]: + model_inputs[k] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + pos_s_feat = self.sub_bert_h5[meta["vid_name"]] # (N_words, D_t) + neg_s_feat = self.sub_bert_h5[meta["inter_neg_vid_name"]] + pos_s_ctx_feat = np.mean(pos_s_feat, axis=0) + neg_s_ctx_feat = np.mean(neg_s_feat, axis=0) + if self.normalize_tfeat: + pos_s_ctx_feat = l2_normalize_np_array(pos_s_ctx_feat) + neg_s_ctx_feat = l2_normalize_np_array(neg_s_ctx_feat) + pos_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["pos_moment"], + normalize=self.normalize_tfeat, + fix_outbound=True, pool_local=self.pool_local) + intra_neg_moment_s_feat = self.get_moment_feat(pos_s_feat, meta["intra_neg_moment"], + normalize=self.normalize_tfeat, + fix_outbound=True, pool_local=self.pool_local) + inter_neg_moment_s_feat = self.get_moment_feat(neg_s_feat, meta["inter_neg_moment"], + normalize=self.normalize_tfeat, + fix_outbound=True, pool_local=self.pool_local) + + # concat features, [sub_clip_feat; sub_context_feat; temporal_endpoint_feat] + model_inputs["pos_moment_sub_feat"] = self.concat_feat_adv( + moment_feats=[pos_moment_s_feat, pos_s_ctx_feat], tef=pos_tef, ctx_mode=self.ctx_mode) + model_inputs["intra_neg_moment_sub_feat"] = self.concat_feat_adv( + moment_feats=[intra_neg_moment_s_feat, pos_s_ctx_feat], tef=intra_neg_tef, ctx_mode=self.ctx_mode) + model_inputs["inter_neg_moment_sub_feat"] = self.concat_feat_adv( + moment_feats=[inter_neg_moment_s_feat, neg_s_ctx_feat], tef=inter_neg_tef, ctx_mode=self.ctx_mode) + else: + for k in ["pos_moment_sub_feat", "intra_neg_moment_sub_feat", "inter_neg_moment_sub_feat"]: + model_inputs[k] = torch.zeros((2, 2)) + + if not self.use_sub and not self.use_video and self.use_tef: # use video stream + model_inputs["pos_moment_video_feat"] = \ + self.concat_feat_adv(tef=pos_tef, ctx_mode=self.ctx_mode) + model_inputs["intra_neg_moment_video_feat"] = \ + self.concat_feat_adv(tef=intra_neg_tef, ctx_mode=self.ctx_mode) + model_inputs["inter_neg_moment_video_feat"] = \ + self.concat_feat_adv(tef=inter_neg_tef, ctx_mode=self.ctx_mode) + return dict(meta=meta, model_inputs=model_inputs) + + def align_ts_to_clip_boundaries(self, duration, ts): + """ # TODO Do we really need this??? 
+ Generate a moment [st, ed] that is most close to a clip boundary, + st and ed must be a multiple of self.clip_length, and ed <= duration + duration: float, + ts: [st (float), ed (float)], ground_truth ts + """ + clip_aligned_ts = np.array([math.floor(ts[0] / self.clip_length), + math.ceil(ts[1] / self.clip_length)]) * self.clip_length + clip_aligned_ts[1] = min(clip_aligned_ts[1], duration) + return clip_aligned_ts + + def sample_intra_neg_moment(self, duration, ts): + """ Generate a intra negative moment given the video duration and the GT ts. + The returned moment will be aligned to clip boundaries. + 1) neg_moment has at least 2 clips + 2) its iou with ts should be < self.neg_iou_thd + Args: + duration: float + ts: [st (float), ed (float)], ground_truth ts + + Returns: + + """ + max_n_search = 5 # search at most max_n_search times, so the program will not be stuck in infinite loops. + sampled_moments = self.sample_ts_at_clip_boundaries(duration, n_pairs=max_n_search) # (n_pairs, 2) + sampled_moments_ious = compute_temporal_iou_batch(sampled_moments, ts) # (n_pairs, ) + smallest_iou_idx = np.argmin(sampled_moments_ious) + sampled_moment = sampled_moments[smallest_iou_idx] + # only a small number (<20 with max_n_search==10) of samples are wrong, + # usually when the video_duration is too short. + # if sampled_moments_ious[smallest_iou_idx] >= self.neg_iou_thd: + # logger.warning("the sampled intra-neg might be wrong. " + # "v_dur {}, ts {}, sampled neg moment {}, iou {}" + # .format(duration, ts, sampled_moment, sampled_moments_ious[smallest_iou_idx])) + return sampled_moment + + def sample_ts_at_clip_boundaries(self, duration, n_pairs=1): + """sample n_pairs moment at clip boundaries, each has at least two clips.""" + # '+ self.clip_length' since we assume indexing using [clip_st_idx, clip_ed_idx), + moments = np.random.randint(0, np.ceil(duration / self.clip_length), size=(n_pairs, 2)) + moments = np.sort(moments, axis=1) * self.clip_length + less_equal = moments[:, 1] - moments[:, 0] <= self.clip_length + start_zero = moments[:, 0] == 0 + moments[:, 1][less_equal * start_zero] += self.clip_length + moments[:, 0][less_equal * (start_zero == False)] -= self.clip_length # keep as bool!!! + return moments + + def sample_inter_video_negative(self, pos_vid_name, normalized_pos_moment, desc_id=None): + """Sample a negative moment --> negative video + similar normalized moment. + 1) they are not from the same video + Args: + pos_vid_name: str, + normalized_pos_moment: np.ndarray, (2, ), value in [0, 1], normalized by duration. + desc_id: str + Returns: + moment: np.ndarray, (2, ), ts aligned to clip boundaries. + + """ + use_guided_negative = hasattr(self, "desc_id2video_names_dur_pairs") + if use_guided_negative: + top_videos = self.desc_id2video_names_dur_pairs[desc_id] + max_idx = len(top_videos) - 1 + + while True: # usually only run once. 
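+            # With an external VR ranking (use_guided_negative), draw an index from an
+            # exponential distribution (expovariate(0.1), mean 10) so the sample is
+            # biased towards the top-ranked, typically harder, negative videos;
+            # otherwise take the video of a randomly chosen training example.
+            # Repeat until the drawn video differs from the positive one.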
+ if use_guided_negative: + sampled_idx = min(max_idx, int(random.expovariate(0.1))) + sampled_video_name, sampled_video_dur = top_videos[sampled_idx] + else: + neg_vid_data = self.data[int(random.random() * len(self))] + sampled_video_name, sampled_video_dur = neg_vid_data["vid_name"], neg_vid_data["duration"] + if sampled_video_name != pos_vid_name: + inter_neg_moment = self.align_ts_to_clip_boundaries( + sampled_video_dur, sampled_video_dur * normalized_pos_moment) + break + + return inter_neg_moment, sampled_video_name, sampled_video_dur + + @classmethod + def get_clip_indices_from_moments(cls, moment, clip_length): + clip_st_ed_indices = moment / clip_length + return math.floor(clip_st_ed_indices[0]), math.ceil(clip_st_ed_indices[1]) + + def get_moment_feat(self, vid_feat, moment, normalize=True, fix_outbound=False, pool_local=False): + """Each moment contains multiple clips. + Inside means [moment[0], moment[1]] (seconds) + Args: + vid_feat: np.ndarray, (N_clips, D) + moment: [st (float), ed (float)], np.ndarray + normalize: L2 normalize features + fix_outbound: bool, + pool_local: whether to mean pool the features + Returns: + moment_feature: np.ndarray, ((moment[1] - moment[0]) / clip_length, D) or (D, ) + """ + clip_st_idx, clip_ed_idx = self.get_clip_indices_from_moments(moment, self.clip_length) + if fix_outbound: + vid_feat_len = len(vid_feat) + if clip_st_idx >= vid_feat_len: + clip_st_idx = vid_feat_len - 2 + moment_feat = vid_feat[clip_st_idx:clip_ed_idx] # indexed as [st, ed) + if pool_local: + moment_feat = np.mean(moment_feat, axis=0, keepdims=True) + if normalize: + moment_feat = l2_normalize_np_array(moment_feat) + return moment_feat # (n_clip_in_moment, D) or (D, ) + + @classmethod + def concat_feat_adv(cls, moment_feats=None, tef=None, to_torch=True, ctx_mode="tef"): + """ Concat moment_feat with other_feats and tef. All the features should be L2 normalized before concatenating + Args: + moment_feats: list of feats, one of them might be None. Other possible values are + ctx_feat (D, ) or sub(vid)_moment_feat (N_p, N_clips, D_t) or (N_clips, D_t). + The first non-None feature array is used as base for the rest to concatenate with. + tef: (N_p, 2) or (2, ), np.ndarray + to_torch: convert resulting np.ndarray to torch.tensor + ctx_mode: + """ + if ctx_mode == "tef": + assembled_feat = np.expand_dims(tef, axis=-2) + else: # concat moment_feat with all other_feats + moment_feats = [e for e in moment_feats if e is not None] # remove possible None (placeholder) + extra_dims = moment_feats[0].shape[:-1] # all others will need to broadcast to match it. 
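+            # Assembly order is [moment_clip_feat ; ctx_feat ; tef], so e.g. the video
+            # stream under ctx_mode "video_sub_tef" ends up with 2 * vid_feat_size + 2
+            # channels, matching vid_feat_output_size computed in __init__.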
+ if isinstance(extra_dims, int): # happens when len(moment_feat.shape) == 2 + extra_dims = (extra_dims, ) + last_dim_lengths = [0, ] + [e.shape[-1] for e in moment_feats] + if "tef" in ctx_mode: # add tef + last_dim_lengths += [2, ] + moment_feats += [np.expand_dims(tef, axis=-2), ] + + if len(moment_feats) > 1: + assembled_feat = np.empty(extra_dims + (sum(last_dim_lengths), ), dtype=np.float32) + last_dim_lengths_cumsum = [sum(last_dim_lengths[0:idx+1]) for idx in range(len(last_dim_lengths))] + for idx, feat in enumerate(moment_feats): + assembled_feat[..., last_dim_lengths_cumsum[idx]:last_dim_lengths_cumsum[idx+1]] = feat + else: + assembled_feat = moment_feats[0] + + if to_torch: + return torch.from_numpy(assembled_feat) + else: + return assembled_feat # (N_prop, N_clips, D_concat) or (N_clips, D_concat) + + +class ProposalRetrievalEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. + load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. + """ + def __init__(self, dset_name, eval_split_name, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, clip_length=None, + eval_proposal_bsz=None, ctx_mode="tef", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, + normalize_tfeat=True, max_n_proposals=90, model_type="cal"): + self.dset_name = dset_name + self.model_type = model_type + self.pool_local = model_type == "mcn" # pool local feature + self.eval_split_name = eval_split_name + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + self.max_n_proposals = max_n_proposals + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.data_path = data_path + self.query_data = load_jsonl(data_path) + if data_ratio != 1: + n_examples = int(len(self.query_data) * data_ratio) + self.query_data = self.query_data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path)[self.eval_split_name] + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + self.eval_proposal_bsz = eval_proposal_bsz + self.clip_length = clip_length + self.proposal_fn = get_proposal_interface(dset_name) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + 
self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + def load_gt_vid_name_for_query(self, load_gt_video): + """load_gt_video: bool, affect the returned value of self._get_item_query""" + assert "vid_name" in self.query_data[0] + self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return len(self.video_data) + else: + return len(self.query_data) + + def __getitem__(self, index): + if self.data_mode == "context": + return self._get_item_context(index) + else: + return self._get_item_query(index) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.query_data[index] + + meta = dict( + desc_id=raw_data["desc_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + query_feat = self.desc_bert_h5[str(raw_data["desc_id"])][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + model_inputs["query_feat"] = torch.from_numpy(query_feat) + return dict(meta=meta, model_inputs=model_inputs) + + def _get_item_context(self, index): + """No need to batch, since it has already been batched here""" + raw_data = self.video_data[index] + + # get proposals and sort in ascending order, to get more efficient batching + proposals = self.proposal_fn( + video_id="", metadata={"duration": raw_data["duration"]}) # np.ndarray (N_p, 2) + proposals_lengths = proposals[:, 1] - proposals[:, 0] # seconds + sorted_proposal_indices = np.argsort(proposals_lengths)[:self.max_n_proposals] + sorted_proposals = proposals[sorted_proposal_indices] + + # initialize with basic data + meta = dict( + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + proposals=sorted_proposals + ) + model_inputs = dict() + + n_proposal_batches = math.ceil(1.0 * len(sorted_proposals) / self.eval_proposal_bsz) + + tef_batched_list = [None, ] * n_proposal_batches + t_moments_mask_list = [None, ] * n_proposal_batches + if self.use_tef: + tef_array = sorted_proposals / meta["duration"] # (N_p, 2) + for batch_idx in range(n_proposal_batches): + st_m_idx = batch_idx * self.eval_proposal_bsz + ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz + tef_batched_list[batch_idx] = tef_array[st_m_idx:ed_m_idx] + t_moments_mask_list[batch_idx] = \ + np.ones((len(tef_batched_list[batch_idx]), 1), dtype=np.float32) + if not self.use_video and not self.use_sub: # use video stream + model_inputs["video_moment_features_list"] = [ + ProposalRetrievalDataset.concat_feat_adv(tef=t, ctx_mode=self.ctx_mode) for t in tef_batched_list] + model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in t_moments_mask_list] + + # extract/group/pad + if self.use_video: + v_feat = self.vid_feat_h5[meta["vid_name"]] # (N_frm, D) + v_ctx_feat = np.mean(v_feat, axis=0) # (D, ) + if self.normalize_vfeat: + v_ctx_feat = l2_normalize_np_array(v_ctx_feat) + v_padded_moments_features_list, v_moments_mask_list = \ + self.get_batched_moment_feat_for_all_proposals(v_feat, sorted_proposals, + pool_local=self.pool_local, + normalize=self.normalize_vfeat) + + model_inputs["video_moment_features_list"] = 
[ProposalRetrievalDataset.concat_feat_adv( + moment_feats=[v, v_ctx_feat], tef=t, ctx_mode=self.ctx_mode) + for v, t in zip(v_padded_moments_features_list, tef_batched_list)] + model_inputs["video_moment_mask_list"] = [torch.from_numpy(e) for e in v_moments_mask_list] + + if self.use_sub: + s_feat = self.sub_bert_h5[meta["vid_name"]] # (N_frm, D) + s_ctx_feat = np.mean(s_feat, axis=0) # (D, ) + if self.normalize_tfeat: + s_ctx_feat = l2_normalize_np_array(s_ctx_feat) + s_padded_moments_features_list, s_moments_mask_list = \ + self.get_batched_moment_feat_for_all_proposals(s_feat, sorted_proposals, + pool_local=self.pool_local, + normalize=self.normalize_tfeat) + model_inputs["sub_moment_features_list"] = [ProposalRetrievalDataset.concat_feat_adv( + moment_feats=[s, s_ctx_feat], tef=t, ctx_mode=self.ctx_mode) + for s, t in zip(s_padded_moments_features_list, tef_batched_list)] + model_inputs["sub_moment_mask_list"] = [torch.from_numpy(e) for e in s_moments_mask_list] + return dict(meta=meta, model_inputs=model_inputs) + + def get_batched_moment_feat_for_all_proposals(self, feature, moments, pool_local=False, normalize=True): + """proposals of the same video wil be segmented into multiple batches to accomodate GPU memory + pool_local: pool local feature into a single vector + """ + n_proposal_batches = math.ceil(1.0 * len(moments) / self.eval_proposal_bsz) + padded_moments_features_list = [None, ] * n_proposal_batches + moments_mask_list = [None, ] * n_proposal_batches + moments_features = self.get_moment_feat_for_all_proposals( + feature, moments, normalize=normalize, pool_local=pool_local) # N_p * [(N_clips, D), ] + for batch_idx in range(n_proposal_batches): + st_m_idx = batch_idx * self.eval_proposal_bsz + ed_m_idx = (batch_idx + 1) * self.eval_proposal_bsz + padded_moments_features, moments_mask = \ + pad_sequences_1d(moments_features[st_m_idx:ed_m_idx], dtype=np.float32) + padded_moments_features_list[batch_idx] = padded_moments_features + moments_mask_list[batch_idx] = moments_mask + assert np.sum(np.sum(moments_mask, axis=1) == 0) == 0, " err {}".format(moments_mask) + assert np.sum(np.sum(moments_mask_list[0], axis=1) == 0) == 0, " err {}".format(moments_mask_list) + return padded_moments_features_list, moments_mask_list + + def get_moment_feat_for_all_proposals(self, vid_feat, moments, normalize=True, pool_local=False): + """Each moment is comprised of multiple clips + Args: + vid_feat: np.ndarray, (N_clips, D) + moments: np.ndarray, (N_p, 2), each row is [st (float), ed (float)], + normalize: L2 normalize + pool_local: + Returns: + moments_features: list(np.ndarray), [(N_clips, D), ] * N_p, N_clips is changing. + """ + if normalize and not pool_local: + vid_feat = l2_normalize_np_array(vid_feat) + vid_feat_len = len(vid_feat) + moments_st_clip_indices = np.floor(moments[:, 0] / self.clip_length).astype(np.int64).clip(0, vid_feat_len-1) + moments_ed_clip_indices = np.ceil(moments[:, 1] / self.clip_length).astype(np.int64).clip(1, vid_feat_len) + moments_features = [] + for st_idx, ed_idx, m in zip(moments_st_clip_indices, moments_ed_clip_indices, moments): + feat = vid_feat[st_idx:ed_idx] + if pool_local: + feat = np.mean(feat, axis=0, keepdims=True) + if normalize: + feat = l2_normalize_np_array(feat) + moments_features.append(feat) + return moments_features + + +def proposal_retrieval_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? 
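+    # pad_sequences_1d pads each per-example tensor to a common length and returns an
+    # accompanying mask, so every value in batched_data is a (padded_feat, mask) tuple;
+    # prepare_batch_inputs below splits it into the separate "*_feat" / "*_mask" tensors
+    # that CALWithSub.forward expects.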
+ + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = {k: pad_sequences_1d([e["model_inputs"][k] for e in batch], dtype=torch.float32) + for k in model_inputs_keys} + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.clip_alignment_with_language.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh b/baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh new file mode 100644 index 0000000000000000000000000000000000000000..975130a0af44ee087d74baef9e89b43313a8aa9b --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/compute_upper_bound.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# run at project root dir +dset_name=$1 # see case below +split_name=$2 # train/val/test, some datasets may not support all the 3 splits +result_dir="baselines/clip_alignment_with_language/results" + +echo "Running with dataset ${dset_name} with split ${split_name}" +case ${dset_name} in + tvr) # only supports train/val + eval_file_path=data/tvr_${split_name}_release.jsonl + save_path=${result_dir}/tvr_${split_name}_proposal_upper_bound.json + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Running evaluation" +python baselines/clip_alignment_with_language/local_utils/compute_proposal_upper_bound.py \ +-dset_name=${dset_name} \ +-eval_file_path=${eval_file_path} \ +-save_path=${save_path} \ +-verbose diff --git a/baselines/clip_alignment_with_language/scripts/inference.sh b/baselines/clip_alignment_with_language/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..3a8fd1fe8d325f53b2fcd5f5c7b550848f82af0c --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/inference.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=(VR) +tasks+=(SVMR) +tasks+=(VCMR) +echo "tasks ${tasks[@]}" +python baselines/clip_alignment_with_language/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/clip_alignment_with_language/scripts/inference_mix.sh b/baselines/clip_alignment_with_language/scripts/inference_mix.sh new file mode 100644 index 0000000000000000000000000000000000000000..3503911fecdfd3a5f2708a42041928ff94034d0b --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/inference_mix.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/inference_mix.sh +eval_model=$1 # [mcn, cal], retrain models should only be paired with mee +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval/baselines/clip_alignment_with_language/results + +# setup eval model +if [[ ${eval_model} == mcn ]]; then + pred_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40 + tef_pred_dir=tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57 +elif [[ ${eval_model} == cal ]]; then + 
pred_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59 + tef_pred_dir=tvr-cal-video_sub_tef-res-2019_11_05_14_25_49 +fi + +pred_path=${project_root}/${pred_dir}/inference_tvr_test_public_max200_predictions_VR_SVMR_VCMR.json +save_path=${project_root}/${pred_dir}/inference_tvr_test_public_max200_predictions_VR_SVMR_VCMR_rerank_${tef_pred_dir}.json +tef_pred_path=${project_root}/${tef_pred_dir}/inference_tvr_test_public_max10000_predictions_VCMR.pt +gt_path=data/tvr_test_public_archive.jsonl + + +python baselines/clip_alignment_with_language/mix_model_prediction.py \ +--pred_path=${pred_path} \ +--tef_pred_path=${tef_pred_path} \ +--gt_path=${gt_path} \ +--save_path=${save_path} diff --git a/baselines/clip_alignment_with_language/scripts/inference_with_external.sh b/baselines/clip_alignment_with_language/scripts/inference_with_external.sh new file mode 100644 index 0000000000000000000000000000000000000000..7695a98cac4c5cd7bd6c04edd7a07e6ba2449055 --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/inference_with_external.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/inference_with_external.sh +#model_dir=$1 +# DO not use NMS, since it gives worse results +eval_model=$1 # [mcn, mcn_tef, cal, cal_tef, mcn_retrain, cal_retrain], retrain models should only be paired with mee +external_model=$2 # [mee, mcn, cal] +eval_split_name=$3 +eval_path=data/tvr_${eval_split_name}_release.jsonl +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval/baselines + +# setup eval model +if [[ ${eval_model} == mcn ]]; then + eval_model_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40 +elif [[ ${eval_model} == mcn_tef ]]; then + eval_model_dir=tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57 +elif [[ ${eval_model} == cal ]]; then + eval_model_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59 +elif [[ ${eval_model} == cal_tef ]]; then + eval_model_dir=tvr-cal-video_sub_tef-res-2019_11_05_14_25_49 +elif [[ ${eval_model} == mcn_tef_retrain ]]; then + eval_model_dir=tvr-mcn-video_sub_tef-+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57+-2019_11_06_02_26_49 +elif [[ ${eval_model} == cal_tef_retrain ]]; then + eval_model_dir=tvr-cal-video_sub_tef-+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-cal-video_sub_tef-res-2019_11_05_14_25_49+-2019_11_06_03_12_15 +fi + +# setup external +if [[ ${external_model} == mee ]]; then + external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39 + external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json +elif [[ ${external_model} == mcn ]]; then + external_model_dir=tvr-mcn-video_sub-res-2019_11_05_14_16_40 + external_inference_vr_res_path=${project_root}/clip_alignment_with_language/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR_SVMR_VCMR.json +elif [[ ${external_model} == cal ]]; then + external_model_dir=tvr-cal-video_sub-res-2019_11_05_14_32_59 + external_inference_vr_res_path=${project_root}/clip_alignment_with_language/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR_SVMR_VCMR.json +fi + +tasks=(VR) +tasks+=(SVMR) +tasks+=(VCMR) +echo "tasks ${tasks[@]}" +python baselines/clip_alignment_with_language/inference.py \ +--model_dir ${eval_model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ 
+--external_inference_vr_res_path ${external_inference_vr_res_path} \ +--eval_id ${external_model_dir} \ +${@:4} + +#--use_intermediate \ # temporary removed + diff --git a/baselines/clip_alignment_with_language/scripts/re_train_cal.sh b/baselines/clip_alignment_with_language/scripts/re_train_cal.sh new file mode 100644 index 0000000000000000000000000000000000000000..f436b5dbf82243e80ac3d1e036b9b3403230ab70 --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/re_train_cal.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +lr=0.00005 +n_epoch=20 +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval +ckpt_filename="model.ckpt" +init_ckpt_path=${project_root}/baselines/clip_alignment_with_language/results/tvr-cal-video_sub_tef-res-2019_11_05_14_25_49/${ckpt_filename} +exp_id=+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-cal-video_sub_tef-res-2019_11_05_14_25_49+ +external_train_vr_res_path=${project_root}/baselines/mixture_embedding_experts/results/tvr-video_sub-res-2019_11_06_00_33_39/inference_tvr_train_None_predictions_VR.json +model_type=cal + +bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \ +--no_norm_vfeat \ +--model_type ${model_type} \ +--exp_id ${exp_id} \ +--init_ckpt_path ${init_ckpt_path} \ +--external_train_vr_res_path ${external_train_vr_res_path} \ +--lr ${lr} \ +--n_epoch ${n_epoch} \ +--max_es_cnt 5 \ +${@:1} diff --git a/baselines/clip_alignment_with_language/scripts/re_train_mcn.sh b/baselines/clip_alignment_with_language/scripts/re_train_mcn.sh new file mode 100644 index 0000000000000000000000000000000000000000..3bb3302e95edc06a3b24661d24b6a780ce58d81d --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/re_train_mcn.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +lr=0.00005 +n_epoch=20 +project_root=/net/bvisionserver14/playpen-ssd/jielei/projects/video_retrieval +ckpt_filename="model.ckpt" +init_ckpt_path=${project_root}/baselines/clip_alignment_with_language/results/tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57/${ckpt_filename} +exp_id=+ex_vr_mee_tvr-video_sub-res-2019_11_06_00_33_39_tvr-mcn-video_sub_tef-res-2019_11_05_14_14_57+ +external_train_vr_res_path=${project_root}/baselines/mixture_embedding_experts/results/tvr-video_sub-res-2019_11_06_00_33_39/inference_tvr_train_None_predictions_VR.json +model_type=mcn + +bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \ +--no_norm_vfeat \ +--model_type ${model_type} \ +--exp_id ${exp_id} \ +--init_ckpt_path ${init_ckpt_path} \ +--external_train_vr_res_path ${external_train_vr_res_path} \ +--lr ${lr} \ +--n_epoch ${n_epoch} \ +--max_es_cnt 5 \ +${@:1} diff --git a/baselines/clip_alignment_with_language/scripts/train.sh b/baselines/clip_alignment_with_language/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..58f1b4e0a5164835c6c4901b61a02c8da6a3d9a4 --- /dev/null +++ b/baselines/clip_alignment_with_language/scripts/train.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +# if re-training, please also give --init_ckpt_path and --external_train_vr_res_path, may also use lower lr ? 
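+# Illustrative invocation (assumes the default data layout under data/ and
+# tvr_feature_release; exp_id is an arbitrary run name of your choice):
+#   bash baselines/clip_alignment_with_language/scripts/train.sh tvr video_sub_tef resnet_i3d \
+#     --model_type cal --exp_id my_cal_run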
+dset_name=$1 # see case below +ctx_mode=$2 # ["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d, none] , none for subtitles only models +feature_root=data/tvr_feature_release +results_root=baselines/clip_alignment_with_language/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + clip_length=1.5 + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." + desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +python baselines/clip_alignment_with_language/train.py \ +--dset_name=${dset_name} \ +--eval_split_name=${eval_split_name} \ +--nms_thd=${nms_thd} \ +--results_root=${results_root} \ +--train_path=${train_path} \ +--desc_bert_path=${desc_bert_path} \ +--corpus_path=${corpus_path} \ +--vid_feat_path=${vid_feat_path} \ +--clip_length=${clip_length} \ +--vid_feat_size=${vid_feat_size} \ +--ctx_mode=${ctx_mode} \ +${extra_args[@]} \ +${@:4} diff --git a/baselines/clip_alignment_with_language/train.py b/baselines/clip_alignment_with_language/train.py new file mode 100644 index 0000000000000000000000000000000000000000..62eff7b4d07cd19583c03d88602a0e96c7f64be9 --- /dev/null +++ b/baselines/clip_alignment_with_language/train.py @@ -0,0 +1,310 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from collections import OrderedDict +from easydict import EasyDict as EDict +from tqdm import tqdm, trange + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from baselines.clip_alignment_with_language.config import BaseOptions +from baselines.clip_alignment_with_language.model import CALWithSub +from 
baselines.clip_alignment_with_language.proposal_retrieval_dataset import \ + ProposalRetrievalDataset, proposal_retrieval_collate, ProposalRetrievalEvalDataset, prepare_batch_inputs +from baselines.clip_alignment_with_language.inference import eval_epoch, start_inference +from utils.basic_utils import save_jsonl, save_json, AverageMeter +from utils.model_utils import count_parameters + + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, train_loader, optimizer, opt, epoch_i): + model.train() + + # init meters + dataloading_time = AverageMeter() + prepare_inputs_time = AverageMeter() + model_forward_time = AverageMeter() + model_backward_time = AverageMeter() + loss_meter = AverageMeter() + + num_training_examples = len(train_loader) + timer_dataloading = time.time() + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + dataloading_time.update(time.time() - timer_dataloading) + + # continue + timer_start = time.time() + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + prepare_inputs_time.update(time.time() - timer_start) + # logger.info("model_inputs {}" + # .format({k: (type(k), v.shape if isinstance(v, torch.Tensor) else v) + # for k, v in model_inputs.items()})) + # logger.info("model_inputs \n{}".format({k: (type(v), v.shape, v.dtype) for k, v in model_inputs.items()})) + timer_start = time.time() + loss = model(**model_inputs) + model_forward_time.update(time.time() - timer_start) + timer_start = time.time() + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + model_backward_time.update(time.time() - timer_start) + + global_step = epoch_i * num_training_examples + batch_idx + opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) + opt.writer.add_scalar("Train/Loss", float(loss), global_step) + loss_meter.update(float(loss)) + + timer_dataloading = time.time() + if opt.debug and batch_idx == 3: + break + to_write = opt.train_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + loss_str=str(loss_meter.avg)) + with open(opt.train_log_filepath, "a") as f: + f.write(to_write) + print("Epoch time stats:") + print("dataloading_time: max {dataloading_time.max} " + "min {dataloading_time.min} avg {dataloading_time.avg}\n" + "prepare_inputs_time: max {prepare_inputs_time.max} " + "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" + "model_forward_time: max {model_forward_time.max} " + "min {model_forward_time.min} avg {model_forward_time.avg}\n" + "model_backward_time: max {model_backward_time.max} " + "min {model_backward_time.min} avg {model_backward_time.avg}\n" + "".format(dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, + model_forward_time=model_forward_time, model_backward_time=model_backward_time)) + + +def train(model, train_dataset, val_dataset, opt): + # Prepare optimizer + optimizer = torch.optim.SGD( + filter(lambda p: p.requires_grad, model.parameters()), + lr=opt.lr, + weight_decay=opt.wd, + momentum=opt.momentum) + # reduce the lr by 0.1 
every 30 epochs + scheduler = torch.optim.lr_scheduler.StepLR( + optimizer, + step_size=30, + gamma=0.1 + ) + + train_loader = DataLoader(train_dataset, + collate_fn=proposal_retrieval_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + prev_best_score = 0. + es_cnt = 0 + start_epoch = -1 if opt.eval_untrained else 0 + eval_tasks_at_training = ["SVMR", ] + save_submission_filename = \ + "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, "_".join(eval_tasks_at_training)) + for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): + if epoch_i > -1: + with torch.autograd.detect_anomaly(): + train_epoch(model, train_loader, optimizer, opt, epoch_i) + global_step = (epoch_i + 1) * len(train_loader) + scheduler.step() + if opt.eval_path is not None: + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, val_dataset, opt, save_submission_filename, tasks=eval_tasks_at_training, + max_before_nms=300, max_after_nms=100) + logger.info("metrics_no_nms {}".format( + pprint.pformat(rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + to_write = opt.eval_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + eval_metrics_str=json.dumps(metrics_no_nms)) + with open(opt.eval_log_filepath, "a") as f: + f.write(to_write) + + # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms + metrics = metrics_no_nms + # early stop/ log / save model + for task_type, task_metrics in metrics.items(): + for iou_thd in [0.5, 0.7]: + opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), + {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, + global_step) + + # use the most strict metric available + if metrics["SVMR"]["0.5-r1"] > prev_best_score: + es_cnt = 0 + prev_best_score = metrics["SVMR"]["0.5-r1"] + + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] + for src, tgt in zip(latest_file_paths, best_file_paths): + os.renames(src, tgt) + logger.info("The checkpoint file has been updated.") + else: + es_cnt += 1 + if es_cnt > opt.max_es_cnt: # early stop + with open(opt.train_log_filepath, "a") as f: + f.write("Early Stop at epoch {}".format(epoch_i)) + logger.info("Early stop at {} with SVMR 0.5-r1 {}".format(epoch_i, prev_best_score)) + break + else: + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + if opt.debug: + break + + opt.writer.close() + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def start_training(): + logger.info("Setup config, data and model...") + opt = BaseOptions().parse() + set_seed(opt.seed) + if opt.debug: # keep the model run deterministically + # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. + # Enable this only when input size is fixed. 
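+        # Here we therefore disable benchmark and force deterministic kernels, trading speed for
+        # run-to-run reproducibility while debugging.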
+ cudnn.benchmark = False + cudnn.deterministic = True + + opt.writer = SummaryWriter(opt.tensorboard_log_dir) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + train_dataset = ProposalRetrievalDataset( + dset_name=opt.dset_name, + model_type=opt.model_type, + data_path=opt.train_path, + desc_bert_path=opt.desc_bert_path, + sub_bert_path=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + vid_feat_path=opt.vid_feat_path, + clip_length=opt.clip_length, + vid_feat_size=opt.vid_feat_size, + sub_feat_size=opt.sub_feat_size, + ctx_mode=opt.ctx_mode, + pos_iou_thd=opt.pos_iou_thd, + neg_iou_thd=opt.neg_iou_thd, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + external_train_vr_res_path=opt.external_train_vr_res_path, # If not None, used to guide negative sampling + corpus_path=opt.corpus_path, + ) + + if opt.eval_path is not None: + eval_dataset = ProposalRetrievalEvalDataset( + dset_name=opt.dset_name, + model_type=opt.model_type, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=train_dataset.desc_bert_h5, + sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, + clip_length=opt.clip_length, + eval_proposal_bsz=opt.eval_proposal_bsz, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + else: + eval_dataset = None + + model_config = EDict( + visual_input_size=train_dataset.vid_feat_output_size, # changes based on visual input type + textual_input_size=train_dataset.sub_feat_output_size, + query_feat_size=opt.desc_feat_size, + visual_hidden_size=opt.visual_hidden_size, # + output_size=opt.output_size, + embedding_size=opt.embedding_size, + lstm_hidden_size=opt.lstm_hidden_size, + margin=opt.margin, # margin for ranking loss + loss_type=opt.loss_type, # loss type, 'hinge' or 'lse' + inter_loss_weight=opt.inter_loss_weight * (opt.ctx_mode == "tef"), # weight for inter negatives + ctx_mode=opt.ctx_mode + ) + logger.info("model_config {}".format(model_config)) + + model = CALWithSub(model_config) + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + if opt.init_ckpt_path is not None: + checkpoint = torch.load(opt.init_ckpt_path) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.init_ckpt_path)) + count_parameters(model) + + logger.info("Start Training...") + train(model, train_dataset, eval_dataset, opt) + return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug + + +if __name__ == '__main__': + model_dir, eval_split_name, eval_path, debug = start_training() + if not debug: + model_dir = model_dir.split(os.sep)[-1] + tasks = ["SVMR", "VCMR"] + input_args = ["--model_dir", model_dir, + "--eval_split_name", eval_split_name, + "--eval_path", eval_path, + "--tasks"] + tasks + + import sys + sys.argv[1:] 
= input_args + logger.info("\n\n\nFINISHED TRAINING!!!") + logger.info("Evaluating model in {}".format(model_dir)) + start_inference() diff --git a/baselines/crossmodal_moment_localization/README.md b/baselines/crossmodal_moment_localization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b874d87a02e9fe36f94990f40ffce3ce88179f02 --- /dev/null +++ b/baselines/crossmodal_moment_localization/README.md @@ -0,0 +1,2 @@ +Cross-modal Moment Localization (XML) +=== diff --git a/baselines/crossmodal_moment_localization/__init__.py b/baselines/crossmodal_moment_localization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ba4cafc7d04c6b82486c7a119c33630d2ad83a4 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/__init__.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..482ff866e0d76733807545a9d58b54c9daba0f93 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/config.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b27b1c7e1415b2358016a7c51a473e7907cb9acd Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/inference.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..489798cd6c70dc06d0b027322c8fb85d447fdccb Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/model_components.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9b751ffbbf627a048418128389bae734c57a7a2 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/model_xml.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc02db4509d07b3b24453b3100b0eae1d786dde7 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/ndcg_iou_topk.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70ad01e2dc536a454283db9b0f2569ba53470534 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/optimization.cpython-311.pyc differ diff --git 
a/baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc b/baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5be5518699f22351ceaed33a41bf824f606ab5e1 Binary files /dev/null and b/baselines/crossmodal_moment_localization/__pycache__/start_end_dataset.cpython-311.pyc differ diff --git a/baselines/crossmodal_moment_localization/config.py b/baselines/crossmodal_moment_localization/config.py new file mode 100644 index 0000000000000000000000000000000000000000..53a514d0b3837d11235d911904bf13594edd296c --- /dev/null +++ b/baselines/crossmodal_moment_localization/config.py @@ -0,0 +1,276 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile +from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--model_name", type=str) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=4, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. " + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=100, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=10, + help="number of epochs to early stop, use -1 to disable early stop") + self.parser.add_argument("--stop_task", type=str, default="VCMR", choices=["VCMR", "SVMR", "VR"], + help="Use metric associated with stop_task for early stop") + self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", + default=["VCMR"], choices=["VCMR", "SVMR", "VR"], + help="evaluate and report numbers for tasks specified here.") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=50, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_context_bsz", type=int, default=200, + help="mini-batch size at inference, for video/sub") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") + self.parser.add_argument("--lw_neg_q", type=float, default=1, + help="weight for ranking loss with negative query and positive context") + self.parser.add_argument("--lw_neg_ctx", type=float, default=1, + help="weight for ranking loss with positive query and negative context") + self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss") + self.parser.add_argument("--train_span_start_epoch", type=int, default=0, + help="which epoch to start training span prediction, -1 to disable") + self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], + help="att loss type, can be hinge loss or its smooth approximation LogSumExp") + self.parser.add_argument("--hard_negtiave_start_epoch", type=int, default=20, + help="which epoch to start hard negative sampling for video-level ranking loss," + "use -1 to disable") + self.parser.add_argument("--hard_pool_size", type=int, default=20, + help="hard negatives are still sampled, but from a harder pool.") + + # Model and Data config + self.parser.add_argument("--max_sub_l", type=int, default=50, + help="max length of all sub sentence 97.71 under 50 for 3 sentences") + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--max_ctx_l", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--val_path", type=str, default=None) + self.parser.add_argument("--test_path", type=str, default=None) + self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide evaluation. 
") + self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features") + self.parser.add_argument("--word2idx_path", type=str, + help="a dict, {word: word_idx, ...}, " + "special tokens are {: 0, : 1, : 2}") + self.parser.add_argument("--vocab_size", type=int, default=-1, + help="Set automatically to len(word2idx)") + self.parser.add_argument("--glove_path", type=str, + help="path to file containing the GloVe embeddings for words in word2idx") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--ctx_mode", type=str, choices=["video", "sub", "video_sub", "tef", + "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. a combination of [video, sub, tef]") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it only when using resnet_i3d feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + self.parser.add_argument("--clip_length", type=float, default=None, + help="each video will be uniformly segmented into small clips, " + "will automatically loaded from ProposalConfigs if None") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + + self.parser.add_argument("--span_predictor_type", type=str, default="conv", choices=["conv", "cat_linear"], + help="how to generate span predictions, " + "conv: apply 1D-Conv layer on top of NxL dot product of query and clips" + "cat_linear: cat the query and clips then use a linear layer to give output. " + "Note cat_linear is implemented as first project query and clips into scores, " + "separately, then sum them up, this should be similar to first cat then project.") + self.parser.add_argument("--stack_conv_predictor_conv_kernel_sizes", type=int, default=-1, nargs="+", + help="combine the results from conv edge detectors of all sizes specified." + "-1: disable. If specified, will ignore --conv_kernel_size option." 
+ "This flag is only used when --merge_two_stream and --span_predictor_type conv!") + self.parser.add_argument("--encoder_type", type=str, default="transformer", + choices=["gru", "lstm", "transformer", "cnn"]) + self.parser.add_argument("--add_pe_rnn", action="store_true", + help="Add positional encoding for GRU and LSTM encoder as well") + self.parser.add_argument("--no_merge_two_stream", action="store_true", help="do not merge video and subtitles") + self.parser.add_argument("--no_cross_att", action="store_true", + help="Use cross-attention for modeling video and subtitles") + self.parser.add_argument("--no_self_att", action="store_true", help="do not use self attention") + self.parser.add_argument("--no_modular", action="store_true", help="do not use modular attention") + self.parser.add_argument("--pe_type", type=str, default="cosine", choices=["none", "linear", "cosine"], + help="Only for query encoding") + self.parser.add_argument("--max_position_embeddings", type=int, default=300) + self.parser.add_argument("--hidden_size", type=int, default=256) + self.parser.add_argument("--n_heads", type=int, default=4) + self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs") + self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers") + self.parser.add_argument("--cross_att_drop", type=float, default=0.1, help="Applied to cross-att") + self.parser.add_argument("--conv_kernel_size", type=int, default=5) + self.parser.add_argument("--conv_stride", type=int, default=1) + self.parser.add_argument("--initializer_range", type=float, default=0.02, + help="initializer range for linear layer") + self.parser.add_argument("--eval_num_per_epoch", type=float) + + # post processing + self.parser.add_argument("--min_pred_l", type=int, default=2, + help="constrain the [st, ed] with ed - st >= 2" + "(2 clips with length 1.5 each, 3 secs in total" + "this is the min length for proposal-based method)") + self.parser.add_argument("--max_pred_l", type=int, default=16, + help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total" + "(16 clips with length 1.5 each, " + "this is the max length for proposal-based method)") + self.parser.add_argument("--q2c_alpha", type=float, default=20, + help="give more importance to top scored videos' spans, " + "the new score will be: s_new = exp(alpha * s), " + "higher alpha indicates more importance. Note s in [-1, 1]") + + self.parser.add_argument("--max_before_nms", type=int, default=200) + self.parser.add_argument("--max_vcmr_video", type=int, default=100, + help="re-ranking in top-max_vcmr_video") + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 
0.6 for charades_sta, 0.5 for anet_cap,") + + def display_save(self, opt): + args = vars(opt) + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(args.items())})) + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + opt.eval_query_bsz = 100 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", + "eval_split_name", "eval_path", "eval_query_bsz", "eval_context_bsz", + "max_pred_l", "min_pred_l", "external_inference_vr_res_path"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + if opt.clip_length is None: + opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] + print("Loaded clip_length {} from proposal config file".format(opt.clip_length)) + opt.results_dir = os.path.join(opt.results_root, "_".join([opt.model_name, opt.exp_id, time.strftime("%Y%m%d_%H%M%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"],) + + self.display_save(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if opt.hard_negtiave_start_epoch != -1: + if opt.hard_pool_size > opt.bsz: + print("[WARNING] hard_pool_size is larger than bsz") + + assert opt.stop_task in opt.eval_tasks_at_training + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + opt.pin_memory = not opt.no_pin_memory + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + if "tef" in opt.ctx_mode and "video" in opt.ctx_mode: + opt.vid_feat_size += 2 + if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode: + opt.sub_feat_size += 2 + + if "video" not in opt.ctx_mode or "sub" not in opt.ctx_mode: + opt.no_merge_two_stream = True + 
opt.no_cross_att = True
+
+        self.opt = opt
+        return opt
+
+
+class TestOptions(BaseOptions):
+    """add additional options for evaluating"""
+    def initialize(self):
+        BaseOptions.initialize(self)
+        # also need to specify --eval_split_name
+        self.parser.add_argument("--eval_id", type=str, help="evaluation id")
+        self.parser.add_argument("--model_dir", type=str,
+                                 help="dir containing the model file, will be converted to absolute path afterwards")
+        self.parser.add_argument("--tasks", type=str, nargs="+",
+                                 choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"],
+                                 help="Which tasks to run."
+                                      "VCMR: Video Corpus Moment Retrieval;"
+                                      "SVMR: Single Video Moment Retrieval;"
+                                      "VR: regular Video Retrieval. (will be performed automatically with VCMR)")
diff --git a/baselines/crossmodal_moment_localization/inference.py b/baselines/crossmodal_moment_localization/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..5446f664c33332a00836aad17f285b0b5d9a2b9f
--- /dev/null
+++ b/baselines/crossmodal_moment_localization/inference.py
@@ -0,0 +1,414 @@
+import os
+import copy
+import math
+import time
+import pprint
+from tqdm import tqdm, trange
+import numpy as np
+
+import torch
+import torch.nn.functional as F
+import torch.backends.cudnn as cudnn
+from torch.utils.data import DataLoader
+
+from baselines.crossmodal_moment_localization.config import TestOptions
+from baselines.crossmodal_moment_localization.model_xml import XML
+from baselines.crossmodal_moment_localization.start_end_dataset import \
+    start_end_collate, StartEndEvalDataset, prepare_batch_inputs
+from baselines.clip_alignment_with_language.inference import \
+    get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms
+from utils.basic_utils import save_json, load_json
+from utils.tensor_utils import find_max_triples_from_upper_triangle_product
+from standalone_eval.eval import eval_retrieval
+
+import logging
+# the module lives in this package (see its .pyc above); import it with the full path so that
+# running the script from the project root works
+from baselines.crossmodal_moment_localization.ndcg_iou_topk import calculate_ndcg_iou
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s",
+                    datefmt="%Y-%m-%d %H:%M:%S",
+                    level=logging.INFO)
+
+
+def compute_context_info(model, eval_dataset, opt):
+    """Use val set to do evaluation, remember to run with torch.no_grad().
+    estimated 2200 (videos) * 100 (frm) * 500 (hsz) * 4 (B) * 2 (video/sub) * 2 (layers) bytes ~ 1.76 GB
+    max_n_videos: only consider max_n_videos videos for each query to return st_ed scores.
+ """ + model.eval() + # eval_dataset.set_data_mode("context") + context_dataloader = DataLoader(eval_dataset, + collate_fn=start_end_collate, + batch_size=opt.eval_context_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + + metas = [] # list(dicts) + video_feat1 = [] + video_feat2 = [] + video_mask = [] + sub_feat1 = [] + sub_feat2 = [] + sub_mask = [] + for idx, batch in tqdm(enumerate(context_dataloader), + desc="Computing query2video scores", + total=len(context_dataloader)): + metas.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + + _video_feat1, _video_feat2, _sub_feat1, _sub_feat2 = model.encode_context( + model_inputs["video_feat"], model_inputs["video_mask"], + model_inputs["sub_feat"], model_inputs["sub_mask"]) + if "video" in opt.ctx_mode: + video_feat1.append(_video_feat1) + video_feat2.append(_video_feat2) + video_mask.append(model_inputs["video_mask"]) + if "sub" in opt.ctx_mode: + sub_feat1.append(_sub_feat1) + sub_feat2.append(_sub_feat2) + sub_mask.append(model_inputs["sub_mask"]) + + def cat_tensor(tensor_list): + if len(tensor_list) == 0: + return None + else: + seq_l = [e.shape[1] for e in tensor_list] + b_sizes = [e.shape[0] for e in tensor_list] + b_sizes_cumsum = np.cumsum([0] + b_sizes) + if len(tensor_list[0].shape) == 3: + hsz = tensor_list[0].shape[2] + res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l), hsz) + elif len(tensor_list[0].shape) == 2: + res_tensor = tensor_list[0].new_zeros(sum(b_sizes), max(seq_l)) + else: + raise ValueError("Only support 2/3 dimensional tensors") + for i, e in enumerate(tensor_list): + res_tensor[b_sizes_cumsum[i]:b_sizes_cumsum[i+1], :seq_l[i]] = e + return res_tensor + + return metas, dict( + video_feat1=cat_tensor(video_feat1), # (N_videos, L, hsz), + video_feat2=cat_tensor(video_feat2), + video_mask=cat_tensor(video_mask), # (N_videos, L) + sub_feat1=cat_tensor(sub_feat1), + sub_feat2=cat_tensor(sub_feat2), + sub_mask=cat_tensor(sub_mask), + ) + + +def index_if_not_none(input_tensor, indices): + if input_tensor is None: + return input_tensor + else: + return input_tensor[indices] + + + + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? 
The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs) # (N, L, L) + # extra_length_mask_array = np.ones(st_ed_prob_product.shape, dtype=bool) # (N, L, L) + # mask_triu = np.triu(extra_length_mask_array, k=min_pred_l) + # mask_triu_reversed = np.logical_not(np.triu(extra_length_mask_array, k=max_pred_l)) + # final_prob_mask = np.logical_and(mask_triu, mask_triu_reversed) # with valid bit to be 1 + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l) + st_ed_prob_product *= valid_prob_mask # invalid location will become zero! + + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. + _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +def load_external_vr_res2(external_vr_res_path, top_n_vr_videos=5): + """return a mapping from query_id to top retrieved video info""" + external_vr_res = load_json(external_vr_res_path) + external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"] + query2video = {e["query_id"]: e["predictions"] for e in external_vr_res} + return query2video + + +def compute_query2ctx_info(model, eval_dataset, opt, video_metas, ctx_info, + max_before_nms=1000, max_n_videos=100, maxtopk=40): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 500 (hsz) * 4 / (1024**2) = 38.15 MB + max_n_videos: int, use max_n_videos videos for computing VCMR/VR results + """ + + video2idx = eval_dataset.video2idx + # video_metas = ctx_info["video_metas"] + if opt.external_inference_vr_res_path is not None: + video_idx2meta_idx = {video2idx[m["vid_name"]]: i for i, m in enumerate(video_metas)} + external_query2video = \ + load_external_vr_res2(opt.external_inference_vr_res_path, top_n_vr_videos=max_n_videos) + # 「query idx: [video meta idx]」 + external_query2video_meta_idx = \ + {k: [video_idx2meta_idx[e[0]] for e in v] for k, v in external_query2video.items()} + else: + external_query2video = None + external_query2video_meta_idx = None + + model.eval() + eval_dataset.set_data_mode("query") + # eval_dataset.load_gt_vid_name_for_query(is_svmr) + query_eval_loader = DataLoader(eval_dataset, + collate_fn=start_end_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + n_total_videos = len(video_metas) + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + + flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int) + flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32) + sorted_q2c_indices = np.empty((n_total_query, max_n_videos), dtype=int) + sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32) + + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + _query_metas = batch[0] + query_metas.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + # query_context_scores (_N_q, N_videos), st_prob, ed_prob (_N_q, N_videos, L) + _query_context_scores, _st_probs, _ed_probs = \ + model.get_pred_from_raw_query(model_inputs["query_feat"], model_inputs["query_mask"], + ctx_info["video_feat1"], ctx_info["video_feat2"], + ctx_info["video_mask"], + ctx_info["sub_feat1"], ctx_info["sub_feat2"], + ctx_info["sub_mask"], + cross=True) + # _query_context_scores = _query_context_scores + 1 # move cosine similarity to [0, 2] + # To give more importance to top scores, the higher opt.alpha is the more importance will be given + _query_context_scores = torch.exp(opt.q2c_alpha * _query_context_scores) + + # normalize to get true probabilities!!! 
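+            # softmax is taken over the last (clip) dimension, independently for start and end, so
+            # each (query, video) row becomes a distribution over the max_ctx_l clip positions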
+ # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_st_probs, dim=-1) # (_N_q, N_videos, L) + _ed_probs = F.softmax(_ed_probs, dim=-1) + + if external_query2video is None: + _sorted_q2c_scores, _sorted_q2c_indices = \ + torch.topk(_query_context_scores, max_n_videos, dim=1, largest=True) + else: + relevant_video_info = [external_query2video[qm["query_id"]] for qm in _query_metas] + _sorted_q2c_indices = _query_context_scores.new( + [[video_idx2meta_idx[sub_e[0]] for sub_e in e] for e in relevant_video_info]).long() + _sorted_q2c_scores = _query_context_scores.new( + [[sub_e[3] for sub_e in e] for e in relevant_video_info]) + _sorted_q2c_scores = torch.exp(opt.q2c_alpha * _sorted_q2c_scores) + # collect data for vr and vcmr + sorted_q2c_indices[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_indices.cpu().numpy() + sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_scores.cpu().numpy() + + + # Get VCMR results + # compute combined scores + row_indices = torch.arange(0, len(_st_probs), device=opt.device).unsqueeze(1) + _st_probs = _st_probs[row_indices, _sorted_q2c_indices] # (_N_q, max_n_videos, L) + _ed_probs = _ed_probs[row_indices, _sorted_q2c_indices] + + # (_N_q, max_n_videos, L, L) + _st_ed_scores = torch.einsum("qvm,qv,qvn->qvmn", _st_probs, _sorted_q2c_scores, _ed_probs) + valid_prob_mask = generate_min_max_length_mask( + _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l) + _st_ed_scores *= torch.from_numpy( + valid_prob_mask).to(_st_ed_scores.device) # invalid location will become zero! + + # sort across the top-max_n_videos videos (by flatten from the 2nd dim) + # the indices here are local indices, not global indices + _n_q = _st_ed_scores.shape[0] + _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1) # (N_q, max_n_videos*L*L) + _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \ + torch.sort(_flat_st_ed_scores, dim=1, descending=True) + # collect data + flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_sorted_scores[:, :max_before_nms].cpu().numpy() + flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_scores_sorted_indices[:, :max_before_nms].cpu().numpy() + + if opt.debug: + break + + + vcmr_res = {} + for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm( + enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)), + desc="[VCMR] Loop over queries to generate predictions", total=n_total_query): # i is query_idx + # list([video_idx(int), st(float), ed(float), score(float)]) + video_meta_indices_local, pred_st_indices, pred_ed_indices = \ + np.unravel_index(_flat_st_ed_scores_sorted_indices, + shape=(max_n_videos, opt.max_ctx_l, opt.max_ctx_l)) + # video_meta_indices_local refers to the indices among the top-max_n_videos + # video_meta_indices refers to the indices in all the videos, which is the True indices + video_meta_indices = sorted_q2c_indices[i, video_meta_indices_local] + + pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length + pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length + cur_vcmr_redictions = [] + for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices, _flat_st_ed_sorted_scores)): # videos + video_idx = video2idx[video_metas[v_meta_idx]["vid_name"]] + cur_vcmr_redictions.append( + { + "video_name": video_metas[v_meta_idx]["vid_name"], + "timestamp": [float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j])], + 
"model_scores": float(v_score) + } + ) + query_id=query_metas[i]["query_id"] + vcmr_res[query_id] = cur_vcmr_redictions[:maxtopk] + return vcmr_res + + +def get_eval_res(model, eval_dataset, context_data, opt, maxtopk): + """compute and save query and video proposal embeddings""" + + video_metas, context_info = compute_context_info(model, context_data, opt) + eval_res = compute_query2ctx_info(model, eval_dataset, opt, video_metas, context_info, + max_before_nms=opt.max_before_nms, max_n_videos=opt.max_vcmr_video, maxtopk=maxtopk) + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + +# def get_prediction_top_n(list_dict_predictions, top_n): +# top_n_res = [] +# for e in list_dict_predictions: +# e["predictions"] = e["predictions"][:top_n] +# top_n_res.append(e) +# return top_n_res + + +def eval_epoch(model, eval_dataset, context_data, logger, opt, max_after_nms, iou_thds, topks): + """max_after_nms: always set to 100, since the eval script only evaluate top-100""" + # IOU_THDS = (0.3, 0.5, 0.7) + + model.eval() + pred_data = get_eval_res(model, eval_dataset, context_data, opt, max(topks)) + # pred_data = get_prediction_top_n(eval_res, top_n=max_after_nms) + gt_data = eval_dataset.ground_truth + average_ndcg = calculate_ndcg_iou(gt_data, pred_data, iou_thds, topks) + return average_ndcg, pred_data + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + loaded_model_cfg = checkpoint["model_cfg"] + loaded_model_cfg["stack_conv_predictor_conv_kernel_sizes"] = -1 + model = XML(loaded_model_cfg) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = StartEndEvalDataset( + dset_name=opt.dset_name, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat + ) + + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git 
a/baselines/crossmodal_moment_localization/model_components.py b/baselines/crossmodal_moment_localization/model_components.py new file mode 100644 index 0000000000000000000000000000000000000000..4ab6ba7d99e105c489089877a1f5ef7d630a5f41 --- /dev/null +++ b/baselines/crossmodal_moment_localization/model_components.py @@ -0,0 +1,317 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DepthwiseSeparableConv(nn.Module): + """ + Depth-wise separable convolution uses less parameters to generate output by convolution. + :Examples: + >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1) + >>> input_tensor = torch.randn(32, 300, 20) + >>> output = m(input_tensor) + """ + + def __init__(self, in_ch, out_ch, k, dim=1, relu=True): + """ + :param in_ch: input hidden dimension size + :param out_ch: output hidden dimension size + :param k: kernel size + :param dim: default 1. 1D conv or 2D conv + """ + super(DepthwiseSeparableConv, self).__init__() + self.relu = relu + if dim == 1: + self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + elif dim == 2: + self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + else: + raise Exception("Incorrect dimension!") + + def forward(self, x): + """ + :Input: (N, L_in, D) + :Output: (N, L_out, D) + """ + x = x.transpose(1, 2) + if self.relu: + out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True) + else: + out = self.pointwise_conv(self.depthwise_conv(x)) + return out.transpose(1, 2) # (N, L, D) + + +class ConvEncoder(nn.Module): + def __init__(self, kernel_size=7, n_filters=128, dropout=0.1): + super(ConvEncoder, self).__init__() + self.dropout = nn.Dropout(dropout) + self.layer_norm = nn.LayerNorm(n_filters) + self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True) + + def forward(self, x, mask): + """ + :param x: (N, L, D) + :param mask: (N, L), is not used. + :return: (N, L, D) + """ + return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D) + + +class TrainablePositionalEncoding(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + def __init__(self, max_position_embeddings, hidden_size, dropout=0.1): + super(TrainablePositionalEncoding, self).__init__() + self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_feat): + """ + Args: + input_feat: (N, L, D) + """ + bsz, seq_length = input_feat.shape[:2] + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) + position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) + + position_embeddings = self.position_embeddings(position_ids) + + embeddings = self.LayerNorm(input_feat + position_embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class PositionEncoding(nn.Module): + """ + Add positional information to input tensor. 
+ :Examples: + >>> model = PositionEncoding(n_filters=6, max_len=10) + >>> test_input1 = torch.zeros(3, 10, 6) + >>> output1 = model(test_input1) + >>> output1.size() + >>> test_input2 = torch.zeros(5, 3, 9, 6) + >>> output2 = model(test_input2) + >>> output2.size() + """ + + def __init__(self, n_filters=128, max_len=500, pe_type="cosine"): + """ + :param n_filters: same with input hidden size + :param max_len: maximum sequence length + :param pe_type: cosine or linear or None + """ + super(PositionEncoding, self).__init__() + self.pe_type = pe_type + if pe_type != "none": + position = torch.arange(0, max_len).float().unsqueeze(1) + if pe_type == "cosine": + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, n_filters) # (L, D) + div_term = torch.exp(torch.arange(0, n_filters, 2).float() * - (math.log(10000.0) / n_filters)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + elif pe_type == "linear": + pe = position / max_len + else: + raise ValueError + self.register_buffer("pe", pe) # buffer is a tensor, not a variable, (L, D) + + def forward(self, x): + """ + :Input: (*, L, D) + :Output: (*, L, D) the same size as input + """ + if self.pe_type != "none": + pe = self.pe.data[:x.size(-2), :] # (#x.size(-2), n_filters) + extra_dim = len(x.size()) - 2 + for _ in range(extra_dim): + pe = pe.unsqueeze(0) + x = x + pe + return x + + +class LinearLayer(nn.Module): + """linear layer configurable with layer normalization, dropout, ReLU.""" + + def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True): + super(LinearLayer, self).__init__() + self.relu = relu + self.layer_norm = layer_norm + if layer_norm: + self.LayerNorm = nn.LayerNorm(in_hsz) + layers = [ + nn.Dropout(dropout), + nn.Linear(in_hsz, out_hsz) + ] + self.net = nn.Sequential(*layers) + + def forward(self, x): + """(N, L, D)""" + if self.layer_norm: + x = self.LayerNorm(x) + x = self.net(x) + if self.relu: + x = F.relu(x, inplace=True) + return x # (N, L, D) + + +bert_config = dict( + hidden_size=768, + intermediate_size=768, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_attention_heads=4, +) + + +class BertLayer(nn.Module): + def __init__(self, config, use_self_attention=True): + super(BertLayer, self).__init__() + self.use_self_attention = use_self_attention + if use_self_attention: + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + """ + Args: + hidden_states: (N, L, D) + attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid + Returns: + + """ + if self.use_self_attention: + attention_output = self.attention(hidden_states, attention_mask) + else: + attention_output = hidden_states + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + """ + Args: + input_tensor: (N, L, D) + attention_mask: (N, L) + Returns: + """ + self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + 
super(BertIntermediate, self).__init__() + self.dense = nn.Sequential( + nn.Linear(config.hidden_size, config.intermediate_size), + nn.ReLU(True)) + + def forward(self, hidden_states): + return self.dense(hidden_states) + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) # (N, nh, L, dh) + + def forward(self, query_states, key_states, value_states, attention_mask): + """ + Args: + query_states: (N, Lq, D) + key_states: (N, L, D) + value_states: (N, L, D) + attention_mask: (N, Lq, L) + Returns: + """ + # only need to mask the dimension where the softmax (last dim) is applied, as another dim (second last) + # will be ignored in future computation anyway + attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L) + mixed_query_layer = self.query(query_states) + mixed_key_layer = self.key(key_states) + mixed_value_layer = self.value(value_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh) + key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh) + value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
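+        # nn.Dropout zeroes individual attention weights (and rescales the survivors by 1/(1-p))
+        # only at training time; at eval time it is a no-op.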
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states diff --git a/baselines/crossmodal_moment_localization/model_xml.py b/baselines/crossmodal_moment_localization/model_xml.py new file mode 100644 index 0000000000000000000000000000000000000000..4b0fea8ec9841755d7e27d0bab31a2409e2dc981 --- /dev/null +++ b/baselines/crossmodal_moment_localization/model_xml.py @@ -0,0 +1,642 @@ +import math +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from easydict import EasyDict as edict +from baselines.crossmodal_moment_localization.model_components import \ + BertAttention, PositionEncoding, LinearLayer, BertSelfAttention, TrainablePositionalEncoding, ConvEncoder +from utils.model_utils import RNNEncoder + +base_bert_layer_config = dict( + hidden_size=768, + intermediate_size=768, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_attention_heads=4, +) + +xml_base_config = edict( + merge_two_stream=True, # merge only the scores + cross_att=True, # cross-attention for video and subtitles + span_predictor_type="conv", + encoder_type="transformer", # cnn, transformer, lstm, gru + add_pe_rnn=False, # add positional encoding for RNNs, (LSTM and GRU) + visual_input_size=2048, # changes based on visual input type + query_input_size=768, + sub_input_size=768, + hidden_size=500, # + conv_kernel_size=5, # conv kernel_size for st_ed predictor + stack_conv_predictor_conv_kernel_sizes=-1, # Do not use + conv_stride=1, # + max_ctx_l=100, + max_desc_l=30, + input_drop=0.1, # dropout for input + drop=0.1, # dropout for other layers + n_heads=4, # self attention heads + ctx_mode="video_sub", # which context are used. 'video', 'sub' or 'video_sub' + margin=0.1, # margin for ranking loss + ranking_loss_type="hinge", # loss type, 'hinge' or 'lse' + lw_neg_q=1, # loss weight for neg. query and pos. context + lw_neg_ctx=1, # loss weight for pos. query and neg. context + lw_st_ed=1, # loss weight for st ed prediction + use_hard_negative=False, # use hard negative at video level, we may change it during training. 
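+    # (added note) when use_hard_negative is switched on during training, negatives are
+    # sampled only from the `hard_pool_size` highest-scoring non-positive videos in the
+    # batch; see XML.get_neg_scores below.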
+ hard_pool_size=20, + use_self_attention=True, + no_modular=False, + pe_type="none", # no positional encoding + initializer_range=0.02, +) + + +class XML(nn.Module): + def __init__(self, config): + super(XML, self).__init__() + self.config = config + # self.position_embeddings = PositionEncoding(n_filters=config.hidden_size, + # max_len=config.max_position_embeddings, + # pe_type=config.pe_type) + self.query_pos_embed = TrainablePositionalEncoding( + max_position_embeddings=config.max_desc_l, + hidden_size=config.hidden_size, dropout=config.input_drop) + self.ctx_pos_embed = TrainablePositionalEncoding( + max_position_embeddings=config.max_ctx_l, + hidden_size=config.hidden_size, dropout=config.input_drop) + self.query_input_proj = LinearLayer(config.query_input_size, + config.hidden_size, + layer_norm=True, + dropout=config.input_drop, + relu=True) + if config.encoder_type == "transformer": # self-att encoder + self.query_encoder = BertAttention(edict( + hidden_size=config.hidden_size, + intermediate_size=config.hidden_size, + hidden_dropout_prob=config.drop, + attention_probs_dropout_prob=config.drop, + num_attention_heads=config.n_heads, + )) + elif config.encoder_type == "cnn": + self.query_encoder = ConvEncoder( + kernel_size=5, + n_filters=config.hidden_size, + dropout=config.drop + ) + elif config.encoder_type in ["gru", "lstm"]: + self.query_encoder = RNNEncoder( + word_embedding_size=config.hidden_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type=config.encoder_type, + return_outputs=True, + return_hidden=False + ) + + conv_cfg = dict(in_channels=1, + out_channels=1, + kernel_size=config.conv_kernel_size, + stride=config.conv_stride, + padding=config.conv_kernel_size // 2, + bias=False) + + cross_att_cfg = edict( + hidden_size=config.hidden_size, + num_attention_heads=config.n_heads, + attention_probs_dropout_prob=config.drop + ) + + self.use_video = "video" in config.ctx_mode + if self.use_video: + self.video_input_proj = LinearLayer(config.visual_input_size, + config.hidden_size, + layer_norm=True, + dropout=config.input_drop, + relu=True) + self.video_encoder1 = copy.deepcopy(self.query_encoder) + self.video_encoder2 = copy.deepcopy(self.query_encoder) + if self.config.cross_att: + self.video_cross_att = BertSelfAttention(cross_att_cfg) + self.video_cross_layernorm = nn.LayerNorm(config.hidden_size) + else: + if self.config.encoder_type == "transformer": + self.video_encoder3 = copy.deepcopy(self.query_encoder) + self.video_query_linear = nn.Linear(config.hidden_size, config.hidden_size) + if config.span_predictor_type == "conv": + if not config.merge_two_stream: + self.video_st_predictor = nn.Conv1d(**conv_cfg) + self.video_ed_predictor = nn.Conv1d(**conv_cfg) + elif config.span_predictor_type == "cat_linear": + self.video_st_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + self.video_ed_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + + self.use_sub = "sub" in config.ctx_mode + if self.use_sub: + self.sub_input_proj = LinearLayer(config.sub_input_size, + config.hidden_size, + layer_norm=True, + dropout=config.input_drop, + relu=True) + self.sub_encoder1 = copy.deepcopy(self.query_encoder) + self.sub_encoder2 = copy.deepcopy(self.query_encoder) + if self.config.cross_att: + self.sub_cross_att = BertSelfAttention(cross_att_cfg) + self.sub_cross_layernorm = nn.LayerNorm(config.hidden_size) + else: + if self.config.encoder_type == "transformer": + self.sub_encoder3 = 
copy.deepcopy(self.query_encoder) + self.sub_query_linear = nn.Linear(config.hidden_size, config.hidden_size) + if config.span_predictor_type == "conv": + if not config.merge_two_stream: + self.sub_st_predictor = nn.Conv1d(**conv_cfg) + self.sub_ed_predictor = nn.Conv1d(**conv_cfg) + elif config.span_predictor_type == "cat_linear": + self.sub_st_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + self.sub_ed_predictor = nn.ModuleList([nn.Linear(config.hidden_size, 1) for _ in range(2)]) + + self.modular_vector_mapping = nn.Linear(in_features=config.hidden_size, + out_features=self.use_sub + self.use_video, + bias=False) + + self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") + + if config.merge_two_stream and config.span_predictor_type == "conv": + if self.config.stack_conv_predictor_conv_kernel_sizes == -1: + self.merged_st_predictor = nn.Conv1d(**conv_cfg) + self.merged_ed_predictor = nn.Conv1d(**conv_cfg) + else: + print("Will be using multiple Conv layers for prediction.") + self.merged_st_predictors = nn.ModuleList() + self.merged_ed_predictors = nn.ModuleList() + num_convs = len(self.config.stack_conv_predictor_conv_kernel_sizes) + for k in self.config.stack_conv_predictor_conv_kernel_sizes: + conv_cfg = dict(in_channels=1, + out_channels=1, + kernel_size=k, + stride=config.conv_stride, + padding=k // 2, + bias=False) + self.merged_st_predictors.append(nn.Conv1d(**conv_cfg)) + self.merged_ed_predictors.append(nn.Conv1d(**conv_cfg)) + self.combine_st_conv = nn.Linear(num_convs, 1, bias=False) + self.combine_ed_conv = nn.Linear(num_convs, 1, bias=False) + + self.reset_parameters() + + def reset_parameters(self): + """ Initialize the weights.""" + + def re_init(module): + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + module.reset_parameters() + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + self.apply(re_init) + + def set_hard_negative(self, use_hard_negative, hard_pool_size): + """use_hard_negative: bool; hard_pool_size: int, """ + self.config.use_hard_negative = use_hard_negative + self.config.hard_pool_size = hard_pool_size + + def set_train_st_ed(self, lw_st_ed): + """pre-train video retrieval then span prediction""" + self.config.lw_st_ed = lw_st_ed + + def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, + tef_feat, tef_mask, st_ed_indices): + """ + Args: + query_feat: (N, Lq, Dq) + query_mask: (N, Lq) + video_feat: (N, Lv, Dv) or None + video_mask: (N, Lv) or None + sub_feat: (N, Lv, Ds) or None + sub_mask: (N, Lv) or None + tef_feat: (N, Lv, 2) or None, + tef_mask: (N, Lv) or None, + st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively. 
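+            Returns:
+                loss: scalar tensor, weighted sum of the span and ranking losses
+                loss_dict: dict of floats with keys "loss_st_ed", "loss_neg_ctx",
+                    "loss_neg_q", "loss_overall"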
+ """ + video_feat1, video_feat2, sub_feat1, sub_feat2 = \ + self.encode_context(video_feat, video_mask, sub_feat, sub_mask) + + query_context_scores, st_prob, ed_prob = \ + self.get_pred_from_raw_query(query_feat, query_mask, + video_feat1, video_feat2, video_mask, + sub_feat1, sub_feat2, sub_mask, cross=False) + + loss_st_ed = 0 + if self.config.lw_st_ed != 0: + loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0]) + loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1]) + loss_st_ed = loss_st + loss_ed + + loss_neg_ctx, loss_neg_q = 0, 0 + if self.config.lw_neg_ctx != 0 or self.config.lw_neg_q != 0: + loss_neg_ctx, loss_neg_q = self.get_video_level_loss(query_context_scores) + + loss_st_ed = self.config.lw_st_ed * loss_st_ed + loss_neg_ctx = self.config.lw_neg_ctx * loss_neg_ctx + loss_neg_q = self.config.lw_neg_q * loss_neg_q + loss = loss_st_ed + loss_neg_ctx + loss_neg_q + return loss, {"loss_st_ed": float(loss_st_ed), + "loss_neg_ctx": float(loss_neg_ctx), + "loss_neg_q": float(loss_neg_q), + "loss_overall": float(loss)} + + def get_visualization_data(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, + tef_feat, tef_mask, st_ed_indices): + assert self.config.merge_two_stream and self.use_video and self.use_sub and not self.config.no_modular + video_feat1, video_feat2, sub_feat1, sub_feat2 = \ + self.encode_context(video_feat, video_mask, sub_feat, sub_mask) + encoded_query = self.encode_input(query_feat, query_mask, + self.query_input_proj, self.query_encoder, self.query_pos_embed) # (N, Lq, D) + # (N, D), (N, D), (N, L, 2) + video_query, sub_query, modular_att_scores = \ + self.get_modularized_queries(encoded_query, query_mask, return_modular_att=True) + # (N, L), (N, L), (N, L) + st_prob, ed_prob, similarity_scores, video_similarity, sub_similarity = self.get_merged_st_ed_prob( + video_query, video_feat2, sub_query, sub_feat2, video_mask, cross=False, return_similaity=True) + + # clean up invalid bits + data = dict(modular_att_scores=modular_att_scores.cpu().numpy(), # (N, Lq, 2), row 0, 1 are video, sub. 
+ st_prob=st_prob.cpu().numpy(), # (N, L) + ed_prob=ed_prob.cpu().numpy(), # (N, L) + similarity_scores=similarity_scores.cpu().numpy(), # (N, L) + video_similarity=video_similarity.cpu().numpy(), # (N, L) + sub_similarity=sub_similarity.cpu().numpy(), # (N, L) + st_ed_indices=st_ed_indices.cpu().numpy()) # (N, L) + query_lengths = query_mask.sum(1).to(torch.long).cpu().tolist() # (N, ) + ctx_lengths = video_mask.sum(1).to(torch.long).cpu().tolist() # (N, ) + # print("query_lengths {}".format((type(query_lengths), len(query_lengths), query_lengths[:10]))) + for k, v in data.items(): + if k == "modular_att_scores": + # print(k, v, v.shape, type(v)) + data[k] = [e[:l] for l, e in zip(query_lengths, v)] # list(e) where e is (Lq_i, 2) + else: + data[k] = [e[:l] for l, e in zip(ctx_lengths, v)] # list(e) where e is (Lc_i) + + # aggregate info for each example + datalist = [] + for idx in range(len(data["modular_att_scores"])): + datalist.append({k: v[idx] for k, v in data.items()}) + return datalist # list(dicts) of length N + + def encode_query(self, query_feat, query_mask): + encoded_query = self.encode_input(query_feat, query_mask, + self.query_input_proj, self.query_encoder, self.query_pos_embed) # (N, Lq, D) + video_query, sub_query = self.get_modularized_queries(encoded_query, query_mask) # (N, D) * 2 + return video_query, sub_query + + def non_cross_encode_context(self, context_feat, context_mask, module_name="video"): + encoder_layer3 = getattr(self, module_name + "_encoder3") \ + if self.config.encoder_type == "transformer" else None + return self._non_cross_encode_context(context_feat, context_mask, + input_proj_layer=getattr(self, module_name + "_input_proj"), + encoder_layer1=getattr(self, module_name + "_encoder1"), + encoder_layer2=getattr(self, module_name + "_encoder2"), + encoder_layer3=encoder_layer3) + + def _non_cross_encode_context(self, context_feat, context_mask, input_proj_layer, + encoder_layer1, encoder_layer2, encoder_layer3=None): + """ + Args: + context_feat: (N, L, D) + context_mask: (N, L) + input_proj_layer: + encoder_layer1: + encoder_layer2: + encoder_layer3 + """ + context_feat1 = self.encode_input( + context_feat, context_mask, input_proj_layer, encoder_layer1, self.ctx_pos_embed) # (N, L, D) + if self.config.encoder_type in ["transformer", "cnn"]: + context_mask = context_mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor + context_feat2 = encoder_layer2(context_feat1, context_mask) # (N, L, D) + if self.config.encoder_type == "transformer": + context_feat2 = encoder_layer3(context_feat2, context_mask) + elif self.config.encoder_type in ["gru", "lstm"]: + context_mask = context_mask.sum(1).long() # (N, ), torch.LongTensor + context_feat2 = encoder_layer2(context_feat1, context_mask)[0] # (N, L, D) + else: + raise NotImplementedError + return context_feat1, context_feat2 + + def encode_context(self, video_feat, video_mask, sub_feat, sub_mask): + if self.config.cross_att: + assert self.use_video and self.use_sub + + return self.cross_encode_context(video_feat, video_mask, sub_feat, sub_mask) + else: + video_feat1, video_feat2 = (None,) * 2 + if self.use_video: + video_feat1, video_feat2 = self.non_cross_encode_context(video_feat, video_mask, module_name="video") + sub_feat1, sub_feat2 = (None,) * 2 + if self.use_sub: + sub_feat1, sub_feat2 = self.non_cross_encode_context(sub_feat, sub_mask, module_name="sub") + return video_feat1, video_feat2, sub_feat1, sub_feat2 + + def cross_encode_context(self, video_feat, video_mask, sub_feat, sub_mask): + 
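+        # (added note) each stream is first projected and encoded on its own, then attends
+        # over the other stream (video <-> sub cross-attention) with a residual + LayerNorm
+        # and a second encoder; both the pre- and post-cross features are returned,
+        # mirroring the (feat1, feat2) pairs of the non-cross path.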
encoded_video_feat = self.encode_input(video_feat, video_mask, + self.video_input_proj, self.video_encoder1, self.ctx_pos_embed) + encoded_sub_feat = self.encode_input(sub_feat, sub_mask, + self.sub_input_proj, self.sub_encoder1, self.ctx_pos_embed) + x_encoded_video_feat = self.cross_context_encoder( + encoded_video_feat, video_mask, encoded_sub_feat, sub_mask, + self.video_cross_att, self.video_cross_layernorm, self.video_encoder2) # (N, L, D) + x_encoded_sub_feat = self.cross_context_encoder( + encoded_sub_feat, sub_mask, encoded_video_feat, video_mask, + self.sub_cross_att, self.sub_cross_layernorm, self.sub_encoder2) # (N, L, D) + return encoded_video_feat, x_encoded_video_feat, encoded_sub_feat, x_encoded_sub_feat + + def cross_context_encoder(self, main_context_feat, main_context_mask, side_context_feat, side_context_mask, + cross_att_layer, norm_layer, self_att_layer): + """ + Args: + main_context_feat: (N, Lq, D) + main_context_mask: (N, Lq) + side_context_feat: (N, Lk, D) + side_context_mask: (N, Lk) + cross_att_layer: + norm_layer: + self_att_layer: + """ + cross_mask = torch.einsum("bm,bn->bmn", main_context_mask, side_context_mask) # (N, Lq, Lk) + cross_out = cross_att_layer(main_context_feat, side_context_feat, side_context_feat, cross_mask) # (N, Lq, D) + residual_out = norm_layer(cross_out + main_context_feat) + if self.config.encoder_type in ["cnn", "transformer"]: + return self_att_layer(residual_out, main_context_mask.unsqueeze(1)) + elif self.config.encoder_type in ["gru", "lstm"]: + return self_att_layer(residual_out, main_context_mask.sum(1).long())[0] + + def encode_input(self, feat, mask, input_proj_layer, encoder_layer, pos_embed_layer): + """ + Args: + feat: (N, L, D_input), torch.float32 + mask: (N, L), torch.float32, with 1 indicates valid query, 0 indicates mask + input_proj_layer: down project input + encoder_layer: encoder layer + # add_pe: bool, whether to add positional encoding + pos_embed_layer + """ + feat = input_proj_layer(feat) + + if self.config.encoder_type in ["cnn", "transformer"]: + feat = pos_embed_layer(feat) + mask = mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor + return encoder_layer(feat, mask) # (N, L, D_hidden) + elif self.config.encoder_type in ["gru", "lstm"]: + if self.config.add_pe_rnn: + feat = pos_embed_layer(feat) + mask = mask.sum(1).long() # (N, ), torch.LongTensor + return encoder_layer(feat, mask)[0] # (N, L, D_hidden) + + def get_modularized_queries(self, encoded_query, query_mask, return_modular_att=False): + """ + Args: + encoded_query: (N, L, D) + query_mask: (N, L) + return_modular_att: bool + """ + if self.config.no_modular: + modular_query = torch.max(mask_logits(encoded_query, query_mask.unsqueeze(2)), dim=1)[0] # (N, D) + return modular_query, modular_query # + else: + modular_attention_scores = self.modular_vector_mapping(encoded_query) # (N, L, 2 or 1) + modular_attention_scores = F.softmax( + mask_logits(modular_attention_scores, query_mask.unsqueeze(2)), dim=1) + # TODO check whether it is the same + modular_queries = torch.einsum("blm,bld->bmd", + modular_attention_scores, encoded_query) # (N, 2 or 1, D) + if return_modular_att: + assert modular_queries.shape[1] == 2 + return modular_queries[:, 0], modular_queries[:, 1], modular_attention_scores + else: + if modular_queries.shape[1] == 2: + return modular_queries[:, 0], modular_queries[:, 1] # (N, D) * 2 + else: # 1 + return modular_queries[:, 0], modular_queries[:, 0] # the same + + def get_modular_weights(self, encoded_query, query_mask): + """ + Args: + 
encoded_query: (N, L, D) + query_mask: (N, L) + """ + max_encoded_query, _ = torch.max(mask_logits(encoded_query, query_mask.unsqueeze(2)), dim=1) # (N, D) + modular_weights = self.modular_weights_calculator(max_encoded_query) # (N, 2) + modular_weights = F.softmax(modular_weights, dim=-1) + return modular_weights[:, 0:1], modular_weights[:, 1:2] # (N, 1) * 2 + + def get_video_level_scores(self, modularied_query, context_feat1, context_mask): + """ Calculate video2query scores for each pair of video and query inside the batch. + Args: + modularied_query: (N, D) + context_feat1: (N, L, D), output of the first transformer encoder layer + context_mask: (N, L) + Returns: + context_query_scores: (N, N) score of each query w.r.t. each video inside the batch, + diagonal positions are positive. used to get negative samples. + """ + modularied_query = F.normalize(modularied_query, dim=-1) + context_feat1 = F.normalize(context_feat1, dim=-1) + query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat1) # (N, L, N) + context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N) + query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N) + query_context_scores, _ = torch.max(query_context_scores, + dim=1) # (N, N) diagonal positions are positive pairs. + return query_context_scores + + def get_merged_st_ed_prob(self, video_query, video_feat, sub_query, sub_feat, context_mask, + cross=False, return_similaity=False): + """context_mask could be either video_mask or sub_mask, since they are the same""" + assert self.use_video and self.use_sub and self.config.span_predictor_type == "conv" + video_query = self.video_query_linear(video_query) + sub_query = self.sub_query_linear(sub_query) + stack_conv = self.config.stack_conv_predictor_conv_kernel_sizes != -1 + num_convs = len(self.config.stack_conv_predictor_conv_kernel_sizes) if stack_conv else None + if cross: + video_similarity = torch.einsum("md,nld->mnl", video_query, video_feat) + sub_similarity = torch.einsum("md,nld->mnl", sub_query, sub_feat) + similarity = (video_similarity + sub_similarity) / 2 # (Nq, Nv, L) from query to all videos. 
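+            # (added note) score fusion: the two modality similarities are simply averaged
+            # into a single (Nq, Nv, L) map, and one shared Conv1d start/end predictor is
+            # applied on top of it below, rather than keeping per-modality predictors.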
+ n_q, n_c, l = similarity.shape + similarity = similarity.view(n_q * n_c, 1, l) + if not stack_conv: + st_prob = self.merged_st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + ed_prob = self.merged_ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + else: + st_prob_list = [] + ed_prob_list = [] + for idx in range(num_convs): + st_prob_list.append(self.merged_st_predictors[idx](similarity).squeeze().unsqueeze(2)) + ed_prob_list.append(self.merged_ed_predictors[idx](similarity).squeeze().unsqueeze(2)) + # (Nq*Nv, L, 3) --> (Nq*Nv, L) -> (Nq, Nv, L) + st_prob = self.combine_st_conv(torch.cat(st_prob_list, dim=2)).view(n_q, n_c, l) + ed_prob = self.combine_ed_conv(torch.cat(ed_prob_list, dim=2)).view(n_q, n_c, l) + else: + video_similarity = torch.einsum("bd,bld->bl", video_query, video_feat) # (N, L) + sub_similarity = torch.einsum("bd,bld->bl", sub_query, sub_feat) # (N, L) + similarity = (video_similarity + sub_similarity) / 2 + if not stack_conv: + st_prob = self.merged_st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + ed_prob = self.merged_ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + else: + st_prob_list = [] + ed_prob_list = [] + for idx in range(num_convs): + st_prob_list.append(self.merged_st_predictors[idx](similarity.unsqueeze(1)).squeeze().unsqueeze(2)) + ed_prob_list.append(self.merged_ed_predictors[idx](similarity.unsqueeze(1)).squeeze().unsqueeze(2)) + st_prob = self.combine_st_conv(torch.cat(st_prob_list, dim=2)).squeeze() # (N, L, 3) --> (N, L) + ed_prob = self.combine_ed_conv(torch.cat(ed_prob_list, dim=2)).squeeze() # (N, L, 3) --> (N, L) + st_prob = mask_logits(st_prob, context_mask) # (N, L) + ed_prob = mask_logits(ed_prob, context_mask) + if return_similaity: + assert not cross + return st_prob, ed_prob, similarity, video_similarity, sub_similarity + else: + return st_prob, ed_prob + + def get_st_ed_prob(self, modularied_query, context_feat2, context_mask, + module_name="video", cross=False): + return self._get_st_ed_prob(modularied_query, context_feat2, context_mask, + module_query_linear=getattr(self, module_name + "_query_linear"), + st_predictor=getattr(self, module_name + "_st_predictor"), + ed_predictor=getattr(self, module_name + "_ed_predictor"), + cross=cross) + + def _get_st_ed_prob(self, modularied_query, context_feat2, context_mask, + module_query_linear, st_predictor, ed_predictor, cross=False): + """ + Args: + modularied_query: (N, D) + context_feat2: (N, L, D), output of the first transformer encoder layer + context_mask: (N, L) + module_query_linear: + st_predictor: + ed_predictor: + cross: at inference, calculate prob for each possible pairs of query and context. + """ + query = module_query_linear(modularied_query) # (N, D) no need to normalize here. + if cross: + if self.config.span_predictor_type == "conv": + similarity = torch.einsum("md,nld->mnl", query, context_feat2) # (Nq, Nv, L) from query to all videos. 
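+                # (added note) every query-video pair is treated as an independent
+                # 1-channel sequence: (Nq, Nv, L) is reshaped to (Nq*Nv, 1, L) so the same
+                # Conv1d yields start/end logits for all pairs at once.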
+ n_q, n_c, l = similarity.shape + similarity = similarity.view(n_q * n_c, 1, l) + st_prob = st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + ed_prob = ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + elif self.config.span_predictor_type == "cat_linear": + st_prob_q = st_predictor[0](query).unsqueeze(1) # (Nq, 1, 1) + st_prob_ctx = st_predictor[1](context_feat2).squeeze().unsqueeze(0) # (1, Nv, L) + st_prob = st_prob_q + st_prob_ctx # (Nq, Nv, L) + ed_prob_q = ed_predictor[0](query).unsqueeze(1) # (Nq, 1, 1) + ed_prob_ctx = ed_predictor[1](context_feat2).squeeze().unsqueeze(0) # (1, Nv, L) + ed_prob = ed_prob_q + ed_prob_ctx # (Nq, Nv, L) + context_mask = context_mask.unsqueeze(0) # (1, Nv, L) + else: + if self.config.span_predictor_type == "conv": + similarity = torch.einsum("bd,bld->bl", query, context_feat2) # (N, L) + st_prob = st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + ed_prob = ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) + elif self.config.span_predictor_type == "cat_linear": + # avoid concatenation by break into smaller matrix multiplications. + st_prob = st_predictor[0](query) + st_predictor[1](context_feat2).squeeze() # (N, L) + ed_prob = ed_predictor[0](query) + ed_predictor[1](context_feat2).squeeze() # (N, L) + st_prob = mask_logits(st_prob, context_mask) # (N, L) + ed_prob = mask_logits(ed_prob, context_mask) + return st_prob, ed_prob + + def get_pred_from_raw_query(self, query_feat, query_mask, + video_feat1, video_feat2, video_mask, + sub_feat1, sub_feat2, sub_mask, cross=False): + """ + Args: + query_feat: (N, Lq, Dq) + query_mask: (N, Lq) + video_feat1: (N, Lv, D) or None + video_feat2: + video_mask: (N, Lv) + sub_feat1: (N, Lv, D) or None + sub_feat2: + sub_mask: (N, Lv) + cross: + """ + video_query, sub_query = self.encode_query(query_feat, query_mask) + divisor = self.use_sub + self.use_video + + # get video-level retrieval scores + video_q2ctx_scores = self.get_video_level_scores(video_query, video_feat1, video_mask) if self.use_video else 0 + sub_q2ctx_scores = self.get_video_level_scores(sub_query, sub_feat1, sub_mask) if self.use_sub else 0 + q2ctx_scores = (video_q2ctx_scores + sub_q2ctx_scores) / divisor # (N, N) + + if self.config.merge_two_stream and self.use_video and self.use_sub: + st_prob, ed_prob = self.get_merged_st_ed_prob( + video_query, video_feat2, sub_query, sub_feat2, video_mask, cross=cross) + else: + video_st_prob, video_ed_prob = self.get_st_ed_prob( + video_query, video_feat2, video_mask, module_name="video", cross=cross) if self.use_video else (0, 0) + sub_st_prob, sub_ed_prob = self.get_st_ed_prob( + sub_query, sub_feat2, sub_mask, module_name="sub", cross=cross) if self.use_sub else (0, 0) + st_prob = (video_st_prob + sub_st_prob) / divisor # (N, Lv) + ed_prob = (video_ed_prob + sub_ed_prob) / divisor # (N, Lv) + return q2ctx_scores, st_prob, ed_prob # un-normalized masked probabilities!!!!! + + def get_video_level_loss(self, query_context_scores): + """ ranking loss between (pos. query + pos. video) and (pos. query + neg. video) or (neg. query + pos. video) + Args: + query_context_scores: (N, N), cosine similarity [-1, 1], + Each row contains the scores between the query to each of the videos inside the batch. 
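+                For example, with a batch of size 3 the entries (0,0), (1,1), (2,2) are the
+                positive pairs; for each of them one negative video (same row) and one
+                negative query (same column) are sampled to form the two ranking losses.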
+ """ + bsz = len(query_context_scores) + diagonal_indices = torch.arange(bsz).to(query_context_scores.device) + pos_scores = query_context_scores[diagonal_indices, diagonal_indices] # (N, ) + query_context_scores_masked = copy.deepcopy(query_context_scores.data) + # impossibly large for cosine similarity, the copy is created as modifying the original will cause error + query_context_scores_masked[diagonal_indices, diagonal_indices] = 999 + pos_query_neg_context_scores = self.get_neg_scores(query_context_scores, + query_context_scores_masked) + neg_query_pos_context_scores = self.get_neg_scores(query_context_scores.transpose(0, 1), + query_context_scores_masked.transpose(0, 1)) + loss_neg_ctx = self.get_ranking_loss(pos_scores, pos_query_neg_context_scores) + loss_neg_q = self.get_ranking_loss(pos_scores, neg_query_pos_context_scores) + return loss_neg_ctx, loss_neg_q + + def get_neg_scores(self, scores, scores_masked): + """ + scores: (N, N), cosine similarity [-1, 1], + Each row are scores: query --> all videos. Transposed version: video --> all queries. + scores_masked: (N, N) the same as scores, except that the diagonal (positive) positions + are masked with a large value. + """ + bsz = len(scores) + batch_indices = torch.arange(bsz).to(scores.device) + _, sorted_scores_indices = torch.sort(scores_masked, descending=True, dim=1) + sample_min_idx = 1 # skip the masked positive + sample_max_idx = min(sample_min_idx + self.config.hard_pool_size, bsz) \ + if self.config.use_hard_negative else bsz + sampled_neg_score_indices = sorted_scores_indices[ + batch_indices, torch.randint(sample_min_idx, sample_max_idx, size=(bsz,)).to(scores.device)] # (N, ) + sampled_neg_scores = scores[batch_indices, sampled_neg_score_indices] # (N, ) + return sampled_neg_scores + + def get_ranking_loss(self, pos_score, neg_score): + """ Note here we encourage positive scores to be larger than negative scores. 
+ Args: + pos_score: (N, ), torch.float32 + neg_score: (N, ), torch.float32 + """ + if self.config.ranking_loss_type == "hinge": # max(0, m + S_neg - S_pos) + return torch.clamp(self.config.margin + neg_score - pos_score, min=0).sum() / len(pos_score) + elif self.config.ranking_loss_type == "lse": # log[1 + exp(S_neg - S_pos)] + return torch.log1p(torch.exp(neg_score - pos_score)).sum() / len(pos_score) + else: + raise NotImplementedError("Only support 'hinge' and 'lse'") + + +def mask_logits(target, mask): + return target * mask + (1 - mask) * (-1e10) diff --git a/baselines/crossmodal_moment_localization/ndcg_iou_topk.py b/baselines/crossmodal_moment_localization/ndcg_iou_topk.py new file mode 100644 index 0000000000000000000000000000000000000000..dd61093f88290c30c1f7794a16c13477f8296729 --- /dev/null +++ b/baselines/crossmodal_moment_localization/ndcg_iou_topk.py @@ -0,0 +1,68 @@ +from utils.basic_utils import load_jsonl, save_jsonl, load_json +import pandas as pd +from tqdm import tqdm +import numpy as np +from collections import defaultdict +import copy + +def calculate_iou(pred_start: float, pred_end: float, gt_start: float, gt_end: float) -> float: + intersection_start = max(pred_start, gt_start) + intersection_end = min(pred_end, gt_end) + intersection = max(0, intersection_end - intersection_start) + union = (pred_end - pred_start) + (gt_end - gt_start) - intersection + return intersection / union if union > 0 else 0 + + +# Function to calculate DCG +def calculate_dcg(scores): + return sum((2**score - 1) / np.log2(idx + 2) for idx, score in enumerate(scores)) + +# Function to calculate NDCG +def calculate_ndcg(pred_scores, true_scores): + dcg = calculate_dcg(pred_scores) + idcg = calculate_dcg(sorted(true_scores, reverse=True)) + return dcg / idcg if idcg > 0 else 0 + + + +def calculate_ndcg_iou(all_gt, all_pred, TS, KS): + performance = defaultdict(lambda: defaultdict(list)) + performance_avg = defaultdict(lambda: defaultdict(float)) + for k in tqdm(all_pred.keys(), desc="Calculate NDCG"): + one_pred = all_pred[k] + one_gt = all_gt[k] + + one_gt.sort(key=lambda x: x["relevance"], reverse=True) + for T in TS: + one_gt_drop = copy.deepcopy(one_gt) + predictions_with_scores = [] + + for pred in one_pred: + pred_video_name, pred_time = pred["video_name"], pred["timestamp"] + matched_rows = [gt for gt in one_gt_drop if gt["video_name"] == pred_video_name] + if not matched_rows: + pred["pred_relevance"] = 0 + else: + ious = [calculate_iou(pred_time[0], pred_time[1], gt["timestamp"][0], gt["timestamp"][1]) for gt in matched_rows] + max_iou_idx = np.argmax(ious) + max_iou_row = matched_rows[max_iou_idx] + + if ious[max_iou_idx] > T: + pred["pred_relevance"] = max_iou_row["relevance"] + # Remove the matched ground truth row + original_idx = one_gt_drop.index(max_iou_row) + one_gt_drop.pop(original_idx) + else: + pred["pred_relevance"] = 0 + predictions_with_scores.append(pred) + for K in KS: + true_scores = [gt["relevance"] for gt in one_gt][:K] + pred_scores = [pred["pred_relevance"] for pred in predictions_with_scores][:K] + ndcg_score = calculate_ndcg(pred_scores, true_scores) + performance[K][T].append(ndcg_score) + for K, vs in performance.items(): + for T, v in vs.items(): + performance_avg[K][T] = np.mean(v) + return performance_avg + + diff --git a/baselines/crossmodal_moment_localization/optimization.py b/baselines/crossmodal_moment_localization/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..985765697f995e0d7821c1b945041b418bbec853 --- 
/dev/null +++ b/baselines/crossmodal_moment_localization/optimization.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +import abc +import sys + +logger = logging.getLogger(__name__) + + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class _LRSchedule(ABC): + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense + def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ + super(_LRSchedule, self).__init__(**kw) + if t_total < 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) + self.warned_for_t_total_at_progress = -1 + + def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ + if self.t_total < 0: + return 1. + progress = float(step) / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + @abc.abstractmethod + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ + return 1. + + +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 
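+    For instance, with warmup=0.1 and the default cycles=0.5, the multiplier rises linearly
+    from 0 to 1 over the first 10% of the steps and then decays as 0.5 * (1 + cos(pi * p)),
+    where p is the fraction of the remaining steps already taken.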
+ """ + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. + :param kw: + """ + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) + return ret + + +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * progress)) + return ret + + +class WarmupConstantSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + """ + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + """ + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0.) + + +SCHEDULES = { + None: ConstantLR, + "none": ConstantLR, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule +} + + +class EMA(object): + """ Exponential Moving Average for model parameters. 
+ references: + [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py + [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py""" + def __init__(self, decay): + self.decay = decay + self.shadow = {} + self.original = {} + + def register(self, name, val): + self.shadow[name] = val.clone() + + def __call__(self, model, step): + decay = min(self.decay, (1 + step) / (10.0 + step)) + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + new_average = \ + (1.0 - decay) * param.data + decay * self.shadow[name] + self.shadow[name] = new_average.clone() + + def assign(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + self.original[name] = param.data.clone() + param.data = self.shadow[name] + + def resume(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + param.data = self.original[name] + + +class BertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 + schedule: schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + """ + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
" + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + super(BertAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(grad, alpha=1 - beta1) + next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
+ if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + # No bias correction + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + + return loss diff --git a/baselines/crossmodal_moment_localization/scripts/eval.sh b/baselines/crossmodal_moment_localization/scripts/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..e75c03d2de065a0099d704482c77af481e127e8c --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/eval.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/eval.sh ANY_OTHER_PYTHON_ARGS +eval_split_name=$1 +submission_path=$2 +save_path=$3 +gt_path=data/tvr_${eval_split_name}_release.jsonl + +python standalone_eval/eval.py \ +--gt_path ${gt_path} \ +--submission_path ${submission_path} \ +--save_path ${save_path} \ +${@:4} diff --git a/baselines/crossmodal_moment_localization/scripts/inference.sh b/baselines/crossmodal_moment_localization/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..cabae575b68fb445567e9b52d4f5c4675022a82e --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/inference.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VCMR) +tasks+=(SVMR) +tasks+=(VR) +echo "tasks ${tasks[@]}" +python baselines/crossmodal_moment_localization/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/crossmodal_moment_localization/scripts/inference_with_external.sh b/baselines/crossmodal_moment_localization/scripts/inference_with_external.sh new file mode 100644 index 0000000000000000000000000000000000000000..20bc039a01ca3dfe08744d8a61c88791678f4e3f --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/inference_with_external.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/inference_with_external.sh +#model_dir=$1 +# DO not use NMS, since it gives worse results +eval_model=$1 # [xml, xml_tef] +eval_split_name=$2 +external_model=mee # [mee, mcn, cal] +eval_path=data/tvr_${eval_split_name}_release.jsonl +project_root=./baselines + +# setup eval model +if [[ ${eval_model} == xml ]]; then + eval_model_dir=tvr-video_sub-resnet_i3d_no_norm_v-2019_11_03_12_22_19 +elif [[ ${eval_model} == xml_tef ]]; then + eval_model_dir=tvr-video_sub_tef-resnet_i3d_no_norm_v-2019_11_03_12_53_01 +fi + +# setup external +if [[ ${external_model} == mee ]]; then + external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39 + external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json +fi + +tasks=(VR) +tasks+=(SVMR) +tasks+=(VCMR) +echo "tasks ${tasks[@]}" +python baselines/crossmodal_moment_localization/inference.py \ +--model_dir ${eval_model_dir} \ +--tasks ${tasks[@]} \ 
+--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +--external_inference_vr_res_path ${external_inference_vr_res_path} \ +--eval_id ${external_model_dir} \ +${@:3} + +#--use_intermediate \ # temporary removed + diff --git a/baselines/crossmodal_moment_localization/scripts/train.sh b/baselines/crossmodal_moment_localization/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..4213ede6e5ef6c855b33d73b2795fea4f9cb9656 --- /dev/null +++ b/baselines/crossmodal_moment_localization/scripts/train.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/crossmodal_moment_localization/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +# use --eval_tasks_at_training ["VR", "SVMR", "VCMR"] --stop_task ["VR", "SVMR", "VCMR"] for +# use --lw_neg_q 0 --lw_neg_ctx 0 for training SVMR/SVMR only +# use --lw_st_ed 0 for training with VR only +dset_name=$1 # see case below +ctx_mode=$2 # [video, sub, tef, video_sub, video_tef, sub_tef, video_sub_tef] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d] +feature_root=data/tvr_feature_release +results_root=baselines/crossmodal_moment_localization/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + clip_length=1.5 + extra_args+=(--max_ctx_l) + extra_args+=(100) # max_ctx_l = 100 for clip_length = 1.5, only ~109/21825 has more than 100. + extra_args+=(--max_pred_l) + extra_args+=(16) + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." 
+ desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +echo " python baselines/crossmodal_moment_localization/train.py --dset_name=${dset_name} --eval_split_name=${eval_split_name} --nms_thd=${nms_thd} --results_root=${results_root} --train_path=${train_path} --desc_bert_path=${desc_bert_path} --corpus_path=${corpus_path} --vid_feat_path=${vid_feat_path} --clip_length=${clip_length} --vid_feat_size=${vid_feat_size} --ctx_mode=${ctx_mode} ${extra_args[@]} ${@:4}" \ No newline at end of file diff --git a/baselines/crossmodal_moment_localization/start_end_dataset.py b/baselines/crossmodal_moment_localization/start_end_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c48735b97c9611b0113ae47bc6f67f7da640055d --- /dev/null +++ b/baselines/crossmodal_moment_localization/start_end_dataset.py @@ -0,0 +1,393 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +import time +import math +import random +from tqdm import tqdm +from utils.basic_utils import load_json, load_json, l2_normalize_np_array, flat_list_of_lists, merge_dicts +from utils.tensor_utils import pad_sequences_1d +from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \ + get_didemo_agreed_ts +import pandas as pd + +logger = logging.getLogger(__name__) + + +class StartEndDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + Return: + a dict: { + "meta": { + "query_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + } + "model_inputs": { + "query_feat": torch.tensor, (L, D_q) + "video_feat": torch.tensor, (n_clip_in_moment, D_video) + "sub_feat": torch.tensor, (n_clip_in_moment, D_sub) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, + max_desc_len, max_ctx_len, + vid_feat_path_or_handler, clip_length, ctx_mode="video", + normalize_vfeat=True, normalize_tfeat=True, h5driver=None, data_ratio=1.0): + self.dset_name = dset_name + self.data_path = data_path + self.data_ratio = data_ratio + + self.desc_bert_path_or_handler = desc_bert_path_or_handler + self.max_desc_len = max_desc_len + + self.sub_bert_path_or_handler = sub_bert_path_or_handler + self.max_ctx_len = max_ctx_len + self.vid_feat_path_or_handler = vid_feat_path_or_handler + self.clip_length = clip_length + self.ctx_mode = ctx_mode + + # prepare desc data + self.data = self.expand_annotations(load_json(data_path)) + + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + 
else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + def __len__(self): + return len(self.data) + + def expand_annotations(self, annotations): + new_annotations = [] + for i in annotations: + query = i["query"] + query_id = i["query_id"] + for moment in i["relevant_moment"]: + moment.update({'query': query, 'query_id': query_id}) + new_annotations.append(moment) + return new_annotations + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["query"], + vid_name=raw_data["video_name"], + duration=raw_data["duration"], + ts=raw_data["timestamp"] , + ) + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + + ctx_l = 0 + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + # note the tef features here are normalized clip indices (1.5 secs), instead of the original time (1 sec) + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + + model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], max_idx=ctx_l-1) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. 
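+            With clip_length = 1.5 (the value implied by the example above), this is
+            st_idx = floor(3.2 / 1.5) = 2 and ed_idx = ceil(7.6 / 1.5) = 6, both capped at max_idx.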
+ # TODO which one is better, [2: 5] or [2: 6) + """ + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) + return torch.LongTensor([st_idx, ed_idx]) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + +class StartEndEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. + load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. + """ + def __init__(self, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, clip_length=None, + ctx_mode="video", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + self.data_path = data_path + + + self.annotations = load_json(data_path) + self.ground_truth = self.get_relevant_moment_gt() + + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path) + self.video_data = [{"vid_name": k, "duration": v} for k, v in video_data.items()] + self.video2idx = {k: v for k, v in video_data.items()} + self.clip_length = clip_length + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + + def get_relevant_moment_gt(self): + gt_all = {} + for data in self.annotations: + gt_all[data["query_id"]] = data["relevant_moment"] + return gt_all + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + # def load_gt_vid_name_for_query(self, load_gt_video): + # """load_gt_video: bool, affect the returned value of self._get_item_query""" + # if load_gt_video: + # assert "vid_name" in self.query_data[0] + # self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return 
len(self.video_data) + else: + return len(self.annotations) + + def __getitem__(self, index): + if self.data_mode == "context": + return self._get_item_context(index) + else: + return self._get_item_query(index) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.annotations[index] + + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["query"], + vid_name=raw_data["video_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. + Given ts = [5, 9], st_idx = 3, ed_idx = 6, + clips should be indexed as [3: 6), the translated back ts should be [4.5:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + # TODO ed_idx -= 1, should also modify relevant code in inference.py + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length) - 1, max_idx) # st_idx could be the same as ed_idx + return torch.LongTensor([st_idx, ed_idx]) + + def _get_item_context(self, index): + """No need to batch, since it has already been batched here""" + raw_data = self.video_data[index] + + # initialize with basic data + meta = dict( + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ) + + model_inputs = dict() + ctx_l = 0 + + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + return dict(meta=meta, model_inputs=model_inputs) + + +def start_end_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? 
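+    # only the tensor inputs are batched below: every "*_feat" entry is padded to the longest
+    # sequence in the batch by pad_sequences_1d (which returns a (padded_tensor, mask) pair),
+    # and "st_ed_indices", when present, is simply stacked into a (N, 2) LongTensor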
+ + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = dict() + for k in model_inputs_keys: + if "feat" in k: + batched_data[k] = pad_sequences_1d( + [e["model_inputs"][k] for e in batch], dtype=torch.float32, fixed_length=None) + + if "st_ed_indices" in model_inputs_keys: + batched_data["st_ed_indices"] = torch.stack( + [e["model_inputs"]["st_ed_indices"] for e in batch], dim=0) + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + if "feat" in k: + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + else: + model_inputs[k] = v.to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.crossmodal_moment_localization.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/crossmodal_moment_localization/train.py b/baselines/crossmodal_moment_localization/train.py new file mode 100644 index 0000000000000000000000000000000000000000..8a79f66e1b5368e9de6bc99d110ec3ab96c8963b --- /dev/null +++ b/baselines/crossmodal_moment_localization/train.py @@ -0,0 +1,226 @@ +import os +import sys +sys.path.append("..") +sys.path.append(".") +import time +import json +import pprint +import random +import numpy as np +from easydict import EasyDict as EDict +from tqdm import tqdm, trange +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from utils.basic_utils import save_json + +from baselines.crossmodal_moment_localization.config import BaseOptions +from baselines.crossmodal_moment_localization.model_xml import XML +from baselines.crossmodal_moment_localization.start_end_dataset import \ + StartEndDataset, start_end_collate, StartEndEvalDataset, prepare_batch_inputs +from baselines.crossmodal_moment_localization.inference import eval_epoch, start_inference +from baselines.crossmodal_moment_localization.optimization import BertAdam +from utils.basic_utils import AverageMeter, get_logger +from utils.model_utils import count_parameters + +def get_eval_data(opt, data_path, data_mode): + dataset = StartEndEvalDataset( + data_path=data_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path if "video" in opt.ctx_mode else None, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + data_mode=data_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat) + return dataset + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def train(model, train_dataset, val_data, test_data, context_data, opt, logger): + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + 
model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + train_loader = DataLoader(train_dataset, + collate_fn=start_end_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01}, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0} + ] + + num_train_optimization_steps = len(train_loader) * opt.n_epoch + optimizer = BertAdam(optimizer_grouped_parameters, + lr=opt.lr, + weight_decay=opt.wd, + warmup=opt.lr_warmup_proportion, + t_total=num_train_optimization_steps, + schedule="warmup_linear") + thresholds = [0.3, 0.5, 0.7] + topks = [10, 20, 40] + best_val_ndcg = 0 + for epoch_i in range(0, opt.n_epoch): + print(f"TRAIN EPOCH: {epoch_i}|{opt.n_epoch}") + eval_step = len(train_loader) // opt.eval_num_per_epoch + if opt.hard_negtiave_start_epoch != -1 and epoch_i >= opt.hard_negtiave_start_epoch: + model.set_hard_negative(True, opt.hard_pool_size) + if opt.train_span_start_epoch != -1 and epoch_i >= opt.train_span_start_epoch: + model.set_train_st_ed(opt.lw_st_ed) + + num_training_examples = len(train_loader) + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + global_step = epoch_i * num_training_examples + batch_idx + 1 + model.train(mode=True) + + # continue + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + loss, loss_dict = model(**model_inputs) + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + + if global_step % eval_step == 0 or batch_idx == len(train_loader): + model.eval() + with torch.no_grad(): + val_performance, val_predictions = eval_epoch(model, val_data, context_data, logger, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + test_performance, test_predictions = eval_epoch(model, test_data, context_data, logger, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + logger.info(f"EPOCH: {epoch_i}") + anchor_ndcg = 0 + line1 = "" + line2 = "VAL: " + line3 = "TEST: " + anchor_ndcg = val_performance[20][0.5] + for K, vs in val_performance.items(): + for T, v in vs.items(): + line1 += f"NDCG@{K}, IoU={T}\t" + line2 += f" {v:.6f}" + + for K, vs in test_performance.items(): + for T, v in vs.items(): + line3 += f" {v:.6f}" + logger.info(line1) + logger.info(line2) + logger.info(line3) + + + if anchor_ndcg > best_val_ndcg: + print("~"*40) + save_json(val_predictions, os.path.join(opt.results_dir, "best_val_predictions.json")) + save_json(test_predictions, os.path.join(opt.results_dir, "best_test_predictions.json")) + best_val_ndcg = anchor_ndcg + logger.info("BEST " + line2) + logger.info("BEST " + line3) + checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + logger.info("save checkpoint: {}".format(opt.ckpt_filepath)) + print("~"*40) + + logger.info("") + + + +def main(): + opt = BaseOptions().parse() + set_seed(opt.seed) + logger = get_logger(opt.results_dir, opt.model_name +"_"+ opt.exp_id) + train_dataset = StartEndDataset( + dset_name=opt.dset_name, + data_path=opt.train_path, + 
desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + context_data = get_eval_data(opt, opt.val_path, data_mode="context") + val_data = get_eval_data(opt, opt.val_path, data_mode="query") + test_data = get_eval_data(opt, opt.test_path, data_mode="query") + + + + model_config = EDict( + merge_two_stream=not opt.no_merge_two_stream, # merge video and subtitles + cross_att=not opt.no_cross_att, # use cross-attention when encoding video and subtitles + span_predictor_type=opt.span_predictor_type, # span_predictor_type + encoder_type=opt.encoder_type, # gru, lstm, transformer + add_pe_rnn=opt.add_pe_rnn, # add pe for RNNs + pe_type=opt.pe_type, # + visual_input_size=opt.vid_feat_size, + sub_input_size=opt.sub_feat_size, # for both desc and subtitles + query_input_size=opt.q_feat_size, # for both desc and subtitles + hidden_size=opt.hidden_size, # + stack_conv_predictor_conv_kernel_sizes=opt.stack_conv_predictor_conv_kernel_sizes, # + conv_kernel_size=opt.conv_kernel_size, + conv_stride=opt.conv_stride, + max_ctx_l=opt.max_ctx_l, + max_desc_l=opt.max_desc_l, + input_drop=opt.input_drop, + cross_att_drop=opt.cross_att_drop, + drop=opt.drop, + n_heads=opt.n_heads, # self-att heads + initializer_range=opt.initializer_range, # for linear layer + ctx_mode=opt.ctx_mode, # video, sub or video_sub + margin=opt.margin, # margin for ranking loss + ranking_loss_type=opt.ranking_loss_type, # loss type, 'hinge' or 'lse' + lw_neg_q=opt.lw_neg_q, # loss weight for neg. query and pos. context + lw_neg_ctx=opt.lw_neg_ctx, # loss weight for pos. query and neg. context + lw_st_ed=0, # will be assigned dynamically at training time + use_hard_negative=False, # reset at each epoch + hard_pool_size=opt.hard_pool_size, + use_self_attention=not opt.no_self_att, # whether to use self attention + no_modular=opt.no_modular + ) + logger.info("model_config {}".format(model_config)) + model = XML(model_config) + count_parameters(model) + logger.info("Start Training...") + train(model, train_dataset, val_data, test_data, context_data, opt, logger) + + +if __name__ == '__main__': + main() diff --git a/baselines/excl/README.md b/baselines/excl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..936d63ed4ba56c2db49f2dd879fb04a8e103983e --- /dev/null +++ b/baselines/excl/README.md @@ -0,0 +1,25 @@ +# Extractive Clip Localization (ExCL) + +This folder contains the model described in the paper +``` +@article{ghosh2019excl, + title={ExCL: Extractive Clip Localization Using Natural Language Descriptions}, + author={Ghosh, Soham and Agarwal, Anuva and Parekh, Zarana and Hauptmann, Alexander}, + journal={NAACL}, + year={2019} +} +``` + +It also resembles the model in +``` +@article{lei2019tvqa+, + title={TVQA+: Spatio-Temporal Grounding for Video Question Answering}, + author={Lei, Jie and Yu, Licheng and Berg, Tamara L and Bansal, Mohit}, + journal={arXiv preprint arXiv:1904.11574}, + year={2019} +} +``` + +Disclaimer: This code is implemented by [Jie Lei](http://www.cs.unc.edu/~jielei/) for the TVR dataset, +it does not guarantee the reproducibility of the original authors' results. 
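+
+For a rough mental model: the network scores every clip of a video as a candidate start and as a
+candidate end of the moment, and the prediction is the highest-scoring valid (start, end) pair.
+A minimal sketch of that selection step (the names and interface here are illustrative, not the
+actual API of the code in this folder), assuming per-clip start/end logits and a 0/1 padding mask:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def pick_best_span(st_logits, ed_logits, mask, max_clips=16):
+    """st_logits, ed_logits, mask: (N, L) float tensors; mask is 1 for valid clips, 0 for padding."""
+    neg = -1e10
+    st_prob = F.softmax(st_logits * mask + (1 - mask) * neg, dim=-1)   # (N, L)
+    ed_prob = F.softmax(ed_logits * mask + (1 - mask) * neg, dim=-1)   # (N, L)
+    joint = torch.einsum("bm,bn->bmn", st_prob, ed_prob)               # score of every (st, ed) pair
+    joint = torch.triu(joint) - torch.triu(joint, diagonal=max_clips)  # keep 0 <= ed - st < max_clips
+    flat = joint.flatten(1).argmax(dim=1)                              # best pair per query
+    n_clips = joint.shape[-1]
+    return flat // n_clips, flat % n_clips                             # (start_idx, end_idx)
+```
+
+The implementation in this folder additionally runs one such start/end head per context stream
+(video and subtitles), averages the two distributions, and trains them with cross-entropy against
+the ground-truth start/end clip indices.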
+ diff --git a/baselines/excl/__init__.py b/baselines/excl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/excl/config.py b/baselines/excl/config.py new file mode 100644 index 0000000000000000000000000000000000000000..cad829c2d0634df6df19d47d08eb9b2034279ec2 --- /dev/null +++ b/baselines/excl/config.py @@ -0,0 +1,271 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile +from baselines.clip_alignment_with_language.local_utils.proposal import ProposalConfigs + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. " + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-3, help="learning rate") + self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=30, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=10, + help="number of epochs to early stop, use -1 to disable early stop") + self.parser.add_argument("--stop_task", type=str, default="SVMR", choices=["VCMR", "SVMR", "VR"]) + self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", + default=["SVMR"], choices=["VCMR", "SVMR", "VR"], + help="evaluate and report numbers for tasks specified here.") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=50, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_context_bsz", type=int, default=200, + help="mini-batch size at inference, for video/sub") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") + self.parser.add_argument("--lw_neg_q", type=float, default=1, + help="weight for ranking loss with negative query and positive context") + self.parser.add_argument("--lw_neg_ctx", type=float, default=1, + help="weight for ranking loss with positive query and negative context") + self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss") + self.parser.add_argument("--train_span_start_epoch", type=int, default=0, + help="which epoch to start training span prediction, -1 to disable") + self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], + help="att loss type, can be hinge loss or its smooth approximation LogSumExp") + self.parser.add_argument("--hard_negtiave_start_epoch", type=int, default=20, + help="which epoch to start hard negative sampling for video-level ranking loss," + "use -1 to disable") + self.parser.add_argument("--hard_pool_size", type=int, default=20, + help="hard negatives are still sampled, but from a harder pool.") + + # Model and Data config + self.parser.add_argument("--max_sub_l", type=int, default=50, + help="max length of all sub sentence 97.71 under 50 for 3 sentences") + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--max_ctx_l", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--eval_path", type=str, default=None, + help="Evaluating during training, for Dev set. 
If None, will only do training, " + "anet_cap and charades_sta has no dev set, so None") + self.parser.add_argument("--use_glove", action="store_true", help="Use GloVe instead of BERT features") + self.parser.add_argument("--word2idx_path", type=str, + help="a dict, {word: word_idx, ...}, " + "special tokens are {: 0, : 1, : 2}") + self.parser.add_argument("--vocab_size", type=int, default=-1, + help="Set automatically to len(word2idx)") + self.parser.add_argument("--glove_path", type=str, + help="path to file containing the GloVe embeddings for words in word2idx") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--ctx_mode", type=str, choices=["video", "sub", "video_sub", "tef", + "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. a combination of [video, sub, tef]") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it when using i3d_resnet concat feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + self.parser.add_argument("--clip_length", type=float, default=None, + help="each video will be uniformly segmented into small clips, " + "will automatically loaded from ProposalConfigs if None") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + + self.parser.add_argument("--external_inference_vr_res_path", type=str, default=None, + help="if set, use external video retrieval results to guide evaluation. ") + self.parser.add_argument("--span_predictor_type", type=str, default="conv", choices=["conv", "cat_linear"], + help="how to generate span predictions, " + "conv: apply 1D-Conv layer on top of NxL dot product of query and clips" + "cat_linear: cat the query and clips then use a linear layer to give output. 
" + "Note cat_linear is implemented as first project query and clips into scores, " + "separately, then sum them up, this should be similar to first cat then project.") + self.parser.add_argument("--encoder_type", type=str, default="transformer", + choices=["gru", "lstm", "transformer", "cnn"]) + self.parser.add_argument("--add_pe_rnn", action="store_true", + help="Add positional encoding for GRU and LSTM encoder as well") + self.parser.add_argument("--no_merge_two_stream", action="store_true", help="do not merge video and subtitles") + self.parser.add_argument("--no_cross_att", action="store_true", + help="Use cross-attention for modeling video and subtitles") + self.parser.add_argument("--no_self_att", action="store_true", help="do not use self attention") + self.parser.add_argument("--no_modular", action="store_true", help="do not use modular attention") + self.parser.add_argument("--pe_type", type=str, default="cosine", choices=["none", "linear", "cosine"], + help="Only for query encoding") + self.parser.add_argument("--max_position_embeddings", type=int, default=300) + self.parser.add_argument("--hidden_size", type=int, default=128) + self.parser.add_argument("--n_heads", type=int, default=4) + self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs") + self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers") + self.parser.add_argument("--cross_att_drop", type=float, default=0.1, help="Applied to cross-att") + self.parser.add_argument("--conv_kernel_size", type=int, default=5) + self.parser.add_argument("--conv_stride", type=int, default=1) + self.parser.add_argument("--initializer_range", type=float, default=0.02, + help="initializer range for linear layer") + + # post processing + self.parser.add_argument("--min_pred_l", type=int, default=2, + help="constrain the [st, ed] with ed - st >= 2" + "(2 clips with length 1.5 each, 3 secs in total" + "this is the min length for proposal-based method)") + self.parser.add_argument("--max_pred_l", type=int, default=16, + help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total" + "(16 clips with length 1.5 each, " + "this is the max length for proposal-based method)") + self.parser.add_argument("--q2c_alpha", type=float, default=20, + help="give more importance to top scored videos' spans, " + "the new score will be: s_new = exp(alpha * s), " + "higher alpha indicates more importance. Note s in [-1, 1]") + + self.parser.add_argument("--max_before_nms", type=int, default=200) + self.parser.add_argument("--max_vcmr_video", type=int, default=100, + help="re-ranking in top-max_vcmr_video") + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 
0.6 for charades_sta, 0.5 for anet_cap,") + + def display_save(self, opt): + args = vars(opt) + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(args.items())})) + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + opt.eval_query_bsz = 100 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", "eval_split_name", + "eval_path", "max_pred_l", "min_pred_l"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + if opt.clip_length is None: + opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] + print("Loaded clip_length {} from proposal config file".format(opt.clip_length)) + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.ctx_mode, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"],) + + self.display_save(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if opt.hard_negtiave_start_epoch != -1: + if opt.hard_pool_size > opt.bsz: + print("[WARNING] hard_pool_size is larger than bsz") + + assert opt.stop_task in opt.eval_tasks_at_training + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + opt.pin_memory = not opt.no_pin_memory + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + if "tef" in opt.ctx_mode and "video" in opt.ctx_mode: + opt.vid_feat_size += 2 + if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode: + opt.sub_feat_size += 2 + + if "video" not in opt.ctx_mode or "sub" not in opt.ctx_mode: + opt.no_merge_two_stream = True + opt.no_cross_att = True + + self.opt = opt + return opt + + 
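+# At training time `BaseOptions().parse()` requires --exp_id and creates a fresh, timestamped
+# results_dir (a zipped copy of the code is also saved there). TestOptions below instead reloads
+# the opt.json stored under --model_dir and overwrites almost every option with the saved values,
+# so only evaluation-specific flags (e.g. --eval_split_name, --nms_thd) need to be passed at inference.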
+class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", + choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"], + help="Which tasks to run." + "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval. (will be performed automatically with VCMR)") diff --git a/baselines/excl/inference.py b/baselines/excl/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..440cf38613b5563f21299c8303a6e9f216b338df --- /dev/null +++ b/baselines/excl/inference.py @@ -0,0 +1,265 @@ +import os +import copy +import math +import pprint +from tqdm import tqdm, trange +import numpy as np + +import torch +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.excl.config import TestOptions +from baselines.excl.model import EXCL +from baselines.excl.start_end_dataset import \ + start_end_collate, ExCLDataset, prepare_batch_inputs +from baselines.clip_alignment_with_language.inference import \ + get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms +from utils.basic_utils import save_json +from utils.tensor_utils import pad_sequences_1d, find_max_triples, find_max_triples_from_upper_triangle_product +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def compute_query2ctx_info_svmr_only(model, eval_dataset, opt, + max_before_nms=1000, max_n_videos=200, tasks=("SVMR",)): + """Use val set to do evaluation, remember to run with torch.no_grad(). + estimated size 20,000 (query) * 500 (hsz) * 4 / (1024**2) = 38.15 MB + max_n_videos: int, use max_n_videos videos for computing VCMR results + """ + model.eval() + query_eval_loader = DataLoader(eval_dataset, + collate_fn=start_end_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + video2idx = eval_dataset.video2idx + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + ctx_len = eval_dataset.max_ctx_len # all pad to this length + + svmr_gt_st_probs = np.zeros((n_total_query, ctx_len), dtype=np.float32) + svmr_gt_ed_probs = np.zeros((n_total_query, ctx_len), dtype=np.float32) + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + _query_metas = batch[0] + query_metas.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + _, _, _st_probs, _ed_probs = model(**model_inputs) + # normalize to get true probabilities!!! 
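+        # (padded positions were already pushed to -1e10 by mask_logits in model.py, so a plain
+        # softmax over the full row leaves them with ~zero probability)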
+ # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_st_probs, dim=-1) # (_N_q, L) + _ed_probs = F.softmax(_ed_probs, dim=-1) + + svmr_gt_st_probs[idx * bsz:(idx + 1) * bsz, :_st_probs.shape[1]] = _st_probs.cpu().numpy() + svmr_gt_ed_probs[idx * bsz:(idx + 1) * bsz, :_ed_probs.shape[1]] = _ed_probs.cpu().numpy() + + if opt.debug: + break + svmr_res = get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, + query_metas, video2idx, + clip_length=opt.clip_length, + min_pred_l=opt.min_pred_l, + max_pred_l=opt.max_pred_l, + max_before_nms=max_before_nms) + return dict(SVMR=svmr_res) + + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs) # (N, L, L) + # extra_length_mask_array = np.ones(st_ed_prob_product.shape, dtype=bool) # (N, L, L) + # mask_triu = np.triu(extra_length_mask_array, k=min_pred_l) + # mask_triu_reversed = np.logical_not(np.triu(extra_length_mask_array, k=max_pred_l)) + # final_prob_mask = np.logical_and(mask_triu, mask_triu_reversed) # with valid bit to be 1 + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l) + st_ed_prob_product *= valid_prob_mask # invalid location will become zero! + + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. 
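+            # with the end index now exclusive, scaling both indices by clip_length converts
+            # (st_idx, ed_idx) from clip positions into a [st, ed) window in seconds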
+ _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +def get_eval_res(model, eval_dataset, opt, tasks, max_after_nms): + """compute and save query and video proposal embeddings""" + eval_res = compute_query2ctx_info_svmr_only(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=max_after_nms, + tasks=tasks) + eval_res["video2idx"] = eval_dataset.video2idx + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_after_nms=100): + """max_after_nms: always set to 100, since the eval script only evaluate top-100""" + model.eval() + logger.info("Computing scores") + eval_submission_raw = get_eval_res(model, eval_dataset, opt, tasks, max_after_nms=max_after_nms) + + IOU_THDS = (0.5, 0.7) + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=max_after_nms) + save_json(eval_submission, submission_path) + + metrics = eval_retrieval(eval_submission, eval_dataset.data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + if opt.nms_thd != -1: + logger.info("Performing nms with nms_thd {}".format(opt.nms_thd)) + eval_submission_after_nms = dict(video2idx=eval_submission_raw["video2idx"]) + for k, nms_func in POST_PROCESSING_MMS_FUNC.items(): + if k in eval_submission_raw: + eval_submission_after_nms[k] = nms_func(eval_submission_raw[k], + nms_thd=opt.nms_thd, + max_before_nms=opt.max_before_nms, + max_after_nms=max_after_nms) + + logger.info("Saving/Evaluating nms results") + submission_nms_path = submission_path.replace(".json", "_nms_thd_{}.json".format(opt.nms_thd)) + save_json(eval_submission_after_nms, submission_nms_path) + metrics_nms = eval_retrieval(eval_submission_after_nms, eval_dataset.data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_nms_path = submission_nms_path.replace(".json", "_metrics.json") + save_json(metrics_nms, save_metrics_nms_path, save_pretty=True, sort_keys=False) + latest_file_paths += [submission_nms_path, save_metrics_nms_path] + else: + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = EXCL(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and 
model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = ExCLDataset( + dset_name=opt.dset_name, + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + corpus_path=opt.corpus_path, + eval_split_name=opt.eval_split_name + ) + + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/excl/inference_with_vcmr.py b/baselines/excl/inference_with_vcmr.py new file mode 100644 index 0000000000000000000000000000000000000000..240551e309f8cb26f62f2b401e0109e9587a3ea5 --- /dev/null +++ b/baselines/excl/inference_with_vcmr.py @@ -0,0 +1,253 @@ +import os +import copy +import math +import pprint +from tqdm import tqdm, trange +import numpy as np + +import time +import torch +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.excl.config import TestOptions +from baselines.excl.model import EXCL +from baselines.excl.start_end_dataset import \ + start_end_collate, ExCLEvalDataset, prepare_batch_inputs +from baselines.clip_alignment_with_language.inference import \ + get_submission_top_n, post_processing_vcmr_nms, post_processing_svmr_nms +from utils.basic_utils import save_json, load_json, flat_list_of_lists +from utils.tensor_utils import pad_sequences_1d, find_max_triples, find_max_triples_from_upper_triangle_product +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def load_external_vr_res_with_scores(external_vr_res_path, top_n_vr_videos=5): + """return a mapping from query_id to top retrieved (vid_name, score)""" + external_vr_res = load_json(external_vr_res_path) + external_vr_res = get_submission_top_n(external_vr_res, top_n=top_n_vr_videos)["VR"] + query2video = {e["query_id"]: [[sub_e[0], sub_e[3]] for sub_e in e["predictions"]] for e in external_vr_res} + return query2video + + +def compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=1000, max_n_videos=200, tasks=("SVMR",)): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 500 (hsz) * 4 / (1024**2) = 38.15 MB + max_n_videos: int, use max_n_videos videos for computing VCMR results + """ + model.eval() + eval_dataset.set_data_mode("query") + + logger.info("Using external VR results from {}".format(opt.external_inference_vr_res_path)) + external_query2video = load_external_vr_res_with_scores( + opt.external_inference_vr_res_path, top_n_vr_videos=100) # {query_id: [(vid_name1, score1), ...]} + video2idx = eval_dataset.video2idx + idx2video = {v: k for k, v in video2idx.items()} + vcmr_res = [] + for idx, single_query_data in tqdm(enumerate(eval_dataset), desc="query2ctx", total=len(eval_dataset)): + single_query_meta = single_query_data["meta"] + query_id = single_query_meta["query_id"] + vid_names = [idx2video[e[0]] for e in external_query2video[query_id]] + bsz = len(vid_names) + model_inputs = eval_dataset.get_batched_context(vid_names)[1] + model_inputs["st_ed_indices"] = torch.zeros(bsz, 2).long() + model_inputs["query_feat"] = (single_query_data["model_inputs"]["query_feat"].unsqueeze(0).repeat(bsz, 1, 1), + torch.ones(bsz, len(single_query_data["model_inputs"]["query_feat"]))) + model_inputs = prepare_batch_inputs(model_inputs, device=opt.device, non_blocking=opt.pin_memory) + _, _, _st_probs, _ed_probs = model(**model_inputs) + + # normalize to get true probabilities!!! + # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_st_probs, dim=-1) # (_N_q, L) + _ed_probs = F.softmax(_ed_probs, dim=-1) + + vr_scores = _st_probs.new([e[1] for e in external_query2video[query_id]]).unsqueeze(1) # (N, 1) + + _st_probs = _st_probs * torch.exp(opt.q2c_alpha * vr_scores) + + st_ed_prob_product = torch.einsum("bm,bn->bmn", _st_probs, _ed_probs) # (Nq, L, L) + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, + min_l=opt.min_pred_l, + max_l=opt.max_pred_l) + st_ed_prob_product *= st_ed_prob_product.new(valid_prob_mask) # invalid location will become zero! + + st_ed_prob_product = st_ed_prob_product.cpu().numpy() + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=50, prob_thd=None) + # print("batched_sorted_triples", batched_sorted_triples[0][:4]) + # print("[12, ] + batched_sorted_triples[0][0]", [12, ] + batched_sorted_triples[0][0].tolist()) + # print("", batched_sorted_triples[0][0].tolist(), type(batched_sorted_triples[0][0].tolist())) + batched_spans_with_names = [] + for vid_name, b in zip(vid_names, batched_sorted_triples): + cur_video_idx = video2idx[vid_name] + batched_spans_with_names += [[cur_video_idx] + e.tolist() for e in b] + + # print("batched_spans_with_names", len(batched_spans_with_names), batched_spans_with_names[0]) + cur_vcmr_redictions = sorted(batched_spans_with_names, key=lambda x: x[3], reverse=True)[:max_before_nms] + cur_query_pred = dict( + query_id=single_query_meta["query_id"], + desc=single_query_meta["desc"], + predictions=cur_vcmr_redictions) + vcmr_res.append(cur_query_pred) + + if opt.debug and idx == 10: + break + return dict(VCMR=vcmr_res) + + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? 
The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_eval_res(model, eval_dataset, opt, tasks, max_after_nms): + """compute and save query and video proposal embeddings""" + eval_res = compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=max_after_nms, + tasks=tasks) + eval_res["video2idx"] = eval_dataset.video2idx + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_svmr_nms, + "VCMR": post_processing_vcmr_nms +} + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_after_nms=100): + """max_after_nms: always set to 100, since the eval script only evaluate top-100""" + model.eval() + logger.info("Computing scores") + # logger.info("Start timing") + # times = [] + # for _ in range(3): + # st_time = time.time() + eval_submission_raw = get_eval_res(model, eval_dataset, opt, tasks, max_after_nms=max_after_nms) + # times += [time.time() - st_time] + # times = torch.FloatTensor(times) + + IOU_THDS = (0.5, 0.7) + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=max_after_nms) + save_json(eval_submission, submission_path) + + metrics = eval_retrieval(eval_submission, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + # metrics["time_avg"] = float(times.mean()) + # metrics["time_std"] = float(times.std()) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + if opt.nms_thd != -1: + logger.info("Performing nms with nms_thd {}".format(opt.nms_thd)) + eval_submission_after_nms = dict(video2idx=eval_submission_raw["video2idx"]) + for k, nms_func in POST_PROCESSING_MMS_FUNC.items(): + if k in eval_submission_raw: + eval_submission_after_nms[k] = nms_func(eval_submission_raw[k], + nms_thd=opt.nms_thd, + max_before_nms=opt.max_before_nms, + max_after_nms=max_after_nms) + + logger.info("Saving/Evaluating nms results") + submission_nms_path = submission_path.replace(".json", "_nms_thd_{}.json".format(opt.nms_thd)) + save_json(eval_submission_after_nms, submission_nms_path) + metrics_nms = eval_retrieval(eval_submission_after_nms, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + save_metrics_nms_path = submission_nms_path.replace(".json", "_metrics.json") + save_json(metrics_nms, save_metrics_nms_path, save_pretty=True, sort_keys=False) + latest_file_paths += [submission_nms_path, save_metrics_nms_path] + else: + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = EXCL(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + 
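+    # note: the checkpoint bundles "model_cfg" together with the weights, which is why the EXCL
+    # architecture can be rebuilt above without access to the original training configuration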
logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + assert opt.external_inference_vr_res_path is not None + + assert opt.eval_path is not None + eval_dataset = ExCLEvalDataset( + dset_name=opt.dset_name, + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + corpus_path=opt.corpus_path, + eval_split_name=opt.eval_split_name + ) + + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/excl/model.py b/baselines/excl/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ab2ef68204674791c02e2f154b0faf9bb822c43b --- /dev/null +++ b/baselines/excl/model.py @@ -0,0 +1,169 @@ +import math +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from utils.model_utils import RNNEncoder +from easydict import EasyDict as edict + + +excl_base_cfg = edict( + visual_input_size=2048, # changes based on visual input type + query_input_size=768, + sub_input_size=768, + hidden_size=256, # + drop=0.5, # dropout for other layers + ctx_mode="video_sub", # which context are used. 
'video', 'sub' or 'video_sub' + initializer_range=0.02, +) + + +class EXCL(nn.Module): + def __init__(self, config): + super(EXCL, self).__init__() + self.config = config + self.use_video = "video" in config.ctx_mode + self.use_sub = "sub" in config.ctx_mode + + self.query_encoder = RNNEncoder( + word_embedding_size=config.query_input_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=False, + return_hidden=True + ) + + if self.use_video: + self.video_encoder = RNNEncoder( + word_embedding_size=config.visual_input_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.video_encoder2 = RNNEncoder( + word_embedding_size=2*config.hidden_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.video_st_predictor = nn.Sequential( + nn.Linear(3*config.hidden_size, config.hidden_size), + nn.Tanh(), + nn.Linear(config.hidden_size, 1)) + self.video_ed_predictor = copy.deepcopy(self.video_st_predictor) + + if self.use_sub: + self.sub_encoder = RNNEncoder( + word_embedding_size=config.sub_input_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.sub_encoder2 = RNNEncoder( + word_embedding_size=2*config.hidden_size, + hidden_size=config.hidden_size // 2, + bidirectional=True, + n_layers=1, + rnn_type="lstm", + return_outputs=True, + return_hidden=False) + + self.sub_st_predictor = nn.Sequential( + nn.Linear(3*config.hidden_size, config.hidden_size), + nn.Tanh(), + nn.Linear(config.hidden_size, 1)) + self.sub_ed_predictor = copy.deepcopy(self.video_st_predictor) + + self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") + + self.reset_parameters() + + def reset_parameters(self): + """ Initialize the weights.""" + + def re_init(module): + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + module.reset_parameters() + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + self.apply(re_init) + + def get_prob_single_stream(self, encoded_query, ctx_feat, ctx_mask, module_name=None): + ctx_mask_rnn = ctx_mask.sum(1).long() + ctx_feat1 = getattr(self, module_name+"_encoder")( + F.dropout(ctx_feat, p=self.config.drop, training=self.training), + ctx_mask_rnn)[0] # (N, Lc, D) + ctx_feat2 = getattr(self, module_name+"_encoder2")( + F.dropout(torch.cat([ctx_feat1, encoded_query], dim=-1), p=self.config.drop, training=self.training), + ctx_mask_rnn)[0] # (N, Lc, D) + ctx_feat3 = torch.cat([ctx_feat2, ctx_feat1, encoded_query], dim=2) # (N, Lc, 3D) + st_probs = getattr(self, module_name+"_st_predictor")(ctx_feat3).squeeze() # (N, Lc) + ed_probs = getattr(self, module_name+"_ed_predictor")(ctx_feat3).squeeze() # (N, Lc) + st_probs = mask_logits(st_probs, ctx_mask) + ed_probs = mask_logits(ed_probs, ctx_mask) + return st_probs, ed_probs + + def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, + tef_feat, tef_mask, st_ed_indices, 
is_training=True): + """ + Args: + query_feat: (N, Lq, Dq) + query_mask: (N, Lq) + video_feat: (N, Lv, Dv) or None + video_mask: (N, Lv) or None + sub_feat: (N, Lv, Ds) or None + sub_mask: (N, Lv) or None + tef_feat: (N, Lv, 2) or None, + tef_mask: (N, Lv) or None, + st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively. + is_training: + """ + query_mask = query_mask.sum(1).long() + encoded_query = self.query_encoder(query_feat, query_mask)[1] # (N, D) + encoded_query = encoded_query.unsqueeze(1).repeat(1, video_feat.shape[1], 1) # (N, Lc, D) + + video_st_prob, video_ed_prob = self.get_prob_single_stream( + encoded_query, video_feat, video_mask, module_name="video") if self.use_video else (0, 0) + + sub_st_prob, sub_ed_prob = self.get_prob_single_stream( + encoded_query, sub_feat, sub_mask, module_name="sub") if self.use_sub else (0, 0) + + st_prob = (video_st_prob + sub_st_prob) / (self.use_video + self.use_sub) + ed_prob = (video_ed_prob + sub_ed_prob) / (self.use_video + self.use_sub) + + if is_training: + loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0]) + loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1]) + loss_st_ed = loss_st + loss_ed + + return loss_st_ed, {"loss_st_ed": float(loss_st_ed)}, st_prob, ed_prob + else: + # used to measure the runtime. not useful for other experiments. + prob_product = torch.einsum("bm,bn->bmn", st_prob, ed_prob) # (N, L, L) + prob_product = torch.triu(prob_product) # () + prob_product = prob_product.view(prob_product.shape[0], -1) + prob_product = torch.topk(prob_product, k=100, dim=1, largest=True) + return None + + +def mask_logits(target, mask): + return target * mask + (1 - mask) * (-1e10) diff --git a/baselines/excl/model_components.py b/baselines/excl/model_components.py new file mode 100644 index 0000000000000000000000000000000000000000..4ab6ba7d99e105c489089877a1f5ef7d630a5f41 --- /dev/null +++ b/baselines/excl/model_components.py @@ -0,0 +1,317 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DepthwiseSeparableConv(nn.Module): + """ + Depth-wise separable convolution uses less parameters to generate output by convolution. + :Examples: + >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1) + >>> input_tensor = torch.randn(32, 300, 20) + >>> output = m(input_tensor) + """ + + def __init__(self, in_ch, out_ch, k, dim=1, relu=True): + """ + :param in_ch: input hidden dimension size + :param out_ch: output hidden dimension size + :param k: kernel size + :param dim: default 1. 
1D conv or 2D conv + """ + super(DepthwiseSeparableConv, self).__init__() + self.relu = relu + if dim == 1: + self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + elif dim == 2: + self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch, + kernel_size=k, groups=in_ch, padding=k//2) + self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, + kernel_size=1, padding=0) + else: + raise Exception("Incorrect dimension!") + + def forward(self, x): + """ + :Input: (N, L_in, D) + :Output: (N, L_out, D) + """ + x = x.transpose(1, 2) + if self.relu: + out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True) + else: + out = self.pointwise_conv(self.depthwise_conv(x)) + return out.transpose(1, 2) # (N, L, D) + + +class ConvEncoder(nn.Module): + def __init__(self, kernel_size=7, n_filters=128, dropout=0.1): + super(ConvEncoder, self).__init__() + self.dropout = nn.Dropout(dropout) + self.layer_norm = nn.LayerNorm(n_filters) + self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True) + + def forward(self, x, mask): + """ + :param x: (N, L, D) + :param mask: (N, L), is not used. + :return: (N, L, D) + """ + return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D) + + +class TrainablePositionalEncoding(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + def __init__(self, max_position_embeddings, hidden_size, dropout=0.1): + super(TrainablePositionalEncoding, self).__init__() + self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) + self.LayerNorm = nn.LayerNorm(hidden_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_feat): + """ + Args: + input_feat: (N, L, D) + """ + bsz, seq_length = input_feat.shape[:2] + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) + position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) + + position_embeddings = self.position_embeddings(position_ids) + + embeddings = self.LayerNorm(input_feat + position_embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class PositionEncoding(nn.Module): + """ + Add positional information to input tensor. + :Examples: + >>> model = PositionEncoding(n_filters=6, max_len=10) + >>> test_input1 = torch.zeros(3, 10, 6) + >>> output1 = model(test_input1) + >>> output1.size() + >>> test_input2 = torch.zeros(5, 3, 9, 6) + >>> output2 = model(test_input2) + >>> output2.size() + """ + + def __init__(self, n_filters=128, max_len=500, pe_type="cosine"): + """ + :param n_filters: same with input hidden size + :param max_len: maximum sequence length + :param pe_type: cosine or linear or None + """ + super(PositionEncoding, self).__init__() + self.pe_type = pe_type + if pe_type != "none": + position = torch.arange(0, max_len).float().unsqueeze(1) + if pe_type == "cosine": + # Compute the positional encodings once in log space. 
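+                # Sinusoidal encoding, as in "Attention Is All You Need":
+                #   pe[pos, 2i] = sin(pos / 10000^(2i/D)),  pe[pos, 2i+1] = cos(pos / 10000^(2i/D)).
+                # div_term below equals 10000^(-2i/D), computed via exp/log for numerical stability.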
+ pe = torch.zeros(max_len, n_filters) # (L, D) + div_term = torch.exp(torch.arange(0, n_filters, 2).float() * - (math.log(10000.0) / n_filters)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + elif pe_type == "linear": + pe = position / max_len + else: + raise ValueError + self.register_buffer("pe", pe) # buffer is a tensor, not a variable, (L, D) + + def forward(self, x): + """ + :Input: (*, L, D) + :Output: (*, L, D) the same size as input + """ + if self.pe_type != "none": + pe = self.pe.data[:x.size(-2), :] # (#x.size(-2), n_filters) + extra_dim = len(x.size()) - 2 + for _ in range(extra_dim): + pe = pe.unsqueeze(0) + x = x + pe + return x + + +class LinearLayer(nn.Module): + """linear layer configurable with layer normalization, dropout, ReLU.""" + + def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True): + super(LinearLayer, self).__init__() + self.relu = relu + self.layer_norm = layer_norm + if layer_norm: + self.LayerNorm = nn.LayerNorm(in_hsz) + layers = [ + nn.Dropout(dropout), + nn.Linear(in_hsz, out_hsz) + ] + self.net = nn.Sequential(*layers) + + def forward(self, x): + """(N, L, D)""" + if self.layer_norm: + x = self.LayerNorm(x) + x = self.net(x) + if self.relu: + x = F.relu(x, inplace=True) + return x # (N, L, D) + + +bert_config = dict( + hidden_size=768, + intermediate_size=768, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_attention_heads=4, +) + + +class BertLayer(nn.Module): + def __init__(self, config, use_self_attention=True): + super(BertLayer, self).__init__() + self.use_self_attention = use_self_attention + if use_self_attention: + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + """ + Args: + hidden_states: (N, L, D) + attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid + Returns: + + """ + if self.use_self_attention: + attention_output = self.attention(hidden_states, attention_mask) + else: + attention_output = hidden_states + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + """ + Args: + input_tensor: (N, L, D) + attention_mask: (N, L) + Returns: + """ + self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Sequential( + nn.Linear(config.hidden_size, config.intermediate_size), + nn.ReLU(True)) + + def forward(self, hidden_states): + return self.dense(hidden_states) + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return 
hidden_states + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) # (N, nh, L, dh) + + def forward(self, query_states, key_states, value_states, attention_mask): + """ + Args: + query_states: (N, Lq, D) + key_states: (N, L, D) + value_states: (N, L, D) + attention_mask: (N, Lq, L) + Returns: + """ + # only need to mask the dimension where the softmax (last dim) is applied, as another dim (second last) + # will be ignored in future computation anyway + attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L) + mixed_query_layer = self.query(query_states) + mixed_key_layer = self.key(key_states) + mixed_value_layer = self.value(value_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh) + key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh) + value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
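+        # nn.Dropout(p) zeroes each attention weight with probability p and rescales the
+        # survivors by 1/(1-p), so the attended context keeps its expected scale.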
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states diff --git a/baselines/excl/optimization.py b/baselines/excl/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..ac4c3095b2f07ef688c450d493c889ca459856ad --- /dev/null +++ b/baselines/excl/optimization.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +import abc +import sys + +logger = logging.getLogger(__name__) + + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class _LRSchedule(ABC): + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense + def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ + super(_LRSchedule, self).__init__(**kw) + if t_total < 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) + self.warned_for_t_total_at_progress = -1 + + def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ + if self.t_total < 0: + return 1. + progress = float(step) / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. 
and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + @abc.abstractmethod + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ + return 1. + + +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + """ + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. + :param kw: + """ + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) + return ret + + +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * progress)) + return ret + + +class WarmupConstantSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. 
after warmup. + """ + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + """ + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0.) + + +SCHEDULES = { + None: ConstantLR, + "none": ConstantLR, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule +} + + +class EMA(object): + """ Exponential Moving Average for model parameters. + references: + [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py + [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py""" + def __init__(self, decay): + self.decay = decay + self.shadow = {} + self.original = {} + + def register(self, name, val): + self.shadow[name] = val.clone() + + def __call__(self, model, step): + decay = min(self.decay, (1 + step) / (10.0 + step)) + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + new_average = \ + (1.0 - decay) * param.data + decay * self.shadow[name] + self.shadow[name] = new_average.clone() + + def assign(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + self.original[name] = param.data.clone() + param.data = self.shadow[name] + + def resume(self, model): + for name, param in model.named_parameters(): + if param.requires_grad: + assert name in self.shadow + param.data = self.original[name] + + +class BertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 + schedule: schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 + """ + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. " + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + super(BertAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(1 - beta1, grad) + next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
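+                # Decoupled (AdamW-style) decay: update = m / (sqrt(v) + eps) + weight_decay * p,
+                # then scaled by the scheduled learning rate below; note no bias correction is applied.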
+ if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + # No bias correction + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + + return loss diff --git a/baselines/excl/scripts/eval.sh b/baselines/excl/scripts/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b506bb56df447c8c67fa84e6927de4e75f2613e --- /dev/null +++ b/baselines/excl/scripts/eval.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/modular_moment_localization/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +eval_split_name=$1 +submission_path=$2 +save_path=$3 +gt_path=data/tvr_${eval_split_name}_release.jsonl + +python standalone_eval/eval.py \ +-gt_path ${gt_path} \ +-submission_path ${submission_path} \ +-save_path ${save_path} \ +${@:4} diff --git a/baselines/excl/scripts/inference.sh b/baselines/excl/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..db796fbca326987f8cc6bdb5c0d71acb327b38a7 --- /dev/null +++ b/baselines/excl/scripts/inference.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/excl/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VCMR) +tasks+=(SVMR) +tasks+=(VR) +echo "tasks ${tasks[@]}" +python baselines/excl/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/excl/scripts/inference_with_vcmr.sh b/baselines/excl/scripts/inference_with_vcmr.sh new file mode 100644 index 0000000000000000000000000000000000000000..87ac8cd0c5e2313bb8227731ebb174c2c43fe288 --- /dev/null +++ b/baselines/excl/scripts/inference_with_vcmr.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/excl/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VCMR) + +project_root=./baselines +external_model_dir=tvr-video_sub-res-2019_11_06_00_33_39 +external_inference_vr_res_path=${project_root}/mixture_embedding_experts/results/${external_model_dir}/inference_tvr_${eval_split_name}_None_predictions_VR.json + + +echo "tasks ${tasks[@]}" +python baselines/excl/inference_with_vcmr.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--external_inference_vr_res_path ${external_inference_vr_res_path} \ +--eval_id ${external_model_dir} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/excl/scripts/train.sh b/baselines/excl/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..ed851ea8d3a7a34a058898ffced532e36dd7cfcc --- /dev/null +++ b/baselines/excl/scripts/train.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/excl/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +# use --eval_tasks_at_training ["VR", "SVMR", "VCMR"] --stop_task ["VR", "SVMR", "VCMR"] for +# use --lw_neg_q 0 --lw_neg_ctx 0 for training SVMR/SVMR only +# use --lw_st_ed 0 for training with VR only +dset_name=$1 # see case 
below +ctx_mode=$2 # ["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d] +feature_root=data/tvr_feature_release +results_root=baselines/excl/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + clip_length=1.5 + extra_args+=(--max_ctx_l) + extra_args+=(100) # max_ctx_l = 100 for clip_length = 1.5, only ~109/21825 has more than 100. + extra_args+=(--max_pred_l) + extra_args+=(16) + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." + desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +python baselines/excl/train.py \ +--dset_name=${dset_name} \ +--eval_split_name=${eval_split_name} \ +--nms_thd=${nms_thd} \ +--results_root=${results_root} \ +--train_path=${train_path} \ +--desc_bert_path=${desc_bert_path} \ +--corpus_path=${corpus_path} \ +--vid_feat_path=${vid_feat_path} \ +--clip_length=${clip_length} \ +--vid_feat_size=${vid_feat_size} \ +--ctx_mode=${ctx_mode} \ +${extra_args[@]} \ +${@:4} diff --git a/baselines/excl/start_end_dataset.py b/baselines/excl/start_end_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..158c0b6144bf0ec14de46daad47c23a5623015f1 --- /dev/null +++ b/baselines/excl/start_end_dataset.py @@ -0,0 +1,380 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +import time +import math +import random +from tqdm import tqdm +from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array, flat_list_of_lists, merge_dicts +from utils.tensor_utils import pad_sequences_1d +from baselines.clip_alignment_with_language.local_utils.compute_proposal_upper_bound import \ + get_didemo_agreed_ts + +logger = logging.getLogger(__name__) + + +class ExCLDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + 
Return: + a dict: { + "meta": { + "query_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + } + "model_inputs": { + "query_feat": torch.tensor, (L, D_q) + "video_feat": torch.tensor, (n_clip_in_moment, D_video) + "sub_feat": torch.tensor, (n_clip_in_moment, D_sub) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, + max_desc_len, max_ctx_len, + vid_feat_path_or_handler, clip_length, ctx_mode="video", + normalize_vfeat=True, normalize_tfeat=True, h5driver=None, data_ratio=1.0, + corpus_path=None, eval_split_name=None): + self.dset_name = dset_name + self.data_path = data_path + self.data_ratio = data_ratio + + self.desc_bert_path_or_handler = desc_bert_path_or_handler + self.max_desc_len = max_desc_len + + self.sub_bert_path_or_handler = sub_bert_path_or_handler + self.max_ctx_len = max_ctx_len + self.vid_feat_path_or_handler = vid_feat_path_or_handler + self.clip_length = clip_length + self.ctx_mode = ctx_mode + + # prepare desc data + self.data = load_jsonl(data_path) + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + if corpus_path is not None: + video_data = load_json(corpus_path)[eval_split_name] + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ts=raw_data["ts"] if self.dset_name != "didemo" else get_didemo_agreed_ts(raw_data["ts"]), + ) + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + + ctx_l = 0 + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = 
torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + # note the tef features here are normalized clip indices (1.5 secs), instead of the original time (1 sec) + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + + model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], max_idx=ctx_l-1) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) + return torch.LongTensor([st_idx, ed_idx]) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + +class ExCLEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. + load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. 
+ """ + def __init__(self, dset_name, eval_split_name, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, clip_length=None, + ctx_mode="video", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): + self.dset_name = dset_name + self.eval_split_name = eval_split_name + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + self.data_path = data_path + self.query_data = load_jsonl(data_path) + if data_ratio != 1: + n_examples = int(len(self.query_data) * data_ratio) + self.query_data = self.query_data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path)[self.eval_split_name] + self.video_data = {k: v[0] for k, v in video_data.items()} + self.video2idx = {k: v[1] for k, v in video_data.items()} + self.clip_length = clip_length + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + def load_gt_vid_name_for_query(self, load_gt_video): + """load_gt_video: bool, affect the returned value of self._get_item_query""" + assert "vid_name" in self.query_data[0] + self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return len(self.video_data) + else: + return len(self.query_data) + + def __getitem__(self, index): + return self._get_item_query(index) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.query_data[index] + + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. 
+ Given ts = [5, 9], st_idx = 3, ed_idx = 6, + clips should be indexed as [3: 6), the translated back ts should be [4.5:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + # TODO ed_idx -= 1, should also modify relevant code in inference.py + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length) - 1, max_idx) # st_idx could be the same as ed_idx + return torch.LongTensor([st_idx, ed_idx]) + + def get_batched_context(self, vid_names): + batch = [self._get_item_context_by_vid_name(e) for e in vid_names] + metas, model_inputs = start_end_collate(batch) + return metas, model_inputs + + def _get_item_context_by_vid_name(self, vid_name): + """No need to batch, since it has already been batched here""" + # initialize with basic data + meta = dict( + vid_name=vid_name, + duration=self.video_data[vid_name], + ) + + model_inputs = dict() + ctx_l = 0 + + if self.use_video: + video_feat = self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clip, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + ctx_l = len(video_feat) + else: + model_inputs["video_feat"] = torch.zeros((2, 2)) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len] # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + ctx_l = len(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros((2, 2)) + + if self.use_tef: + ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l + tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l + tef_ed = tef_st + 1.0 / ctx_l + tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) + model_inputs["tef_feat"] = tef + else: + model_inputs["tef_feat"] = torch.zeros((2, 2)) + + if self.use_video and self.use_tef: + model_inputs["video_feat"] = torch.cat( + [model_inputs["video_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D+2) + if self.use_sub and self.use_tef: + model_inputs["sub_feat"] = torch.cat( + [model_inputs["sub_feat"], model_inputs["tef_feat"]], dim=1) # (N_clips, D_t+2) + return dict(meta=meta, model_inputs=model_inputs) + + +def start_end_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? 
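+    # pad_sequences_1d returns a (padded_tensor, mask) pair for every "*_feat" key;
+    # prepare_batch_inputs below unpacks v[0] as the feature and v[1] as its mask.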
+ + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = dict() + for k in model_inputs_keys: + if "feat" in k: + batched_data[k] = pad_sequences_1d( + [e["model_inputs"][k] for e in batch], dtype=torch.float32, fixed_length=None) + + if "st_ed_indices" in model_inputs_keys: + batched_data["st_ed_indices"] = torch.stack( + [e["model_inputs"]["st_ed_indices"] for e in batch], dim=0) + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + if "feat" in k: + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + else: + model_inputs[k] = v.to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.crossmodal_moment_localization.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/excl/train.py b/baselines/excl/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ac40adae00163564842d774c32b3884a5e16e7df --- /dev/null +++ b/baselines/excl/train.py @@ -0,0 +1,305 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from easydict import EasyDict as EDict +from tqdm import tqdm, trange +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from baselines.excl.config import BaseOptions +from baselines.excl.model import EXCL +from baselines.excl.start_end_dataset import \ + ExCLDataset, start_end_collate, prepare_batch_inputs +from baselines.excl.inference import eval_epoch, start_inference +from utils.basic_utils import AverageMeter +from utils.model_utils import count_parameters + + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True): + logger.info("use train_epoch func for training: {}".format(training)) + model.train(mode=training) + + # init meters + dataloading_time = AverageMeter() + prepare_inputs_time = AverageMeter() + model_forward_time = AverageMeter() + model_backward_time = AverageMeter() + loss_meters = OrderedDict(loss_st_ed=AverageMeter()) + + num_training_examples = len(train_loader) + timer_dataloading = time.time() + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + global_step = epoch_i * num_training_examples + batch_idx + dataloading_time.update(time.time() - timer_dataloading) + + # continue + timer_start = time.time() + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + prepare_inputs_time.update(time.time() - timer_start) + # logger.info("model_inputs {}" + # .format({k: (type(k), v.shape if isinstance(v, torch.Tensor) else v) + # for k, v in model_inputs.items()})) + # logger.info("model_inputs \n{}".format({k: (type(v), v.shape, v.dtype) for k, v in model_inputs.items()})) + timer_start = time.time() + loss, loss_dict, _, _ = model(**model_inputs) + 
model_forward_time.update(time.time() - timer_start) + timer_start = time.time() + if training: + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + model_backward_time.update(time.time() - timer_start) + + opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) + for k, v in loss_dict.items(): + opt.writer.add_scalar("Train/{}".format(k), v, global_step) + + for k, v in loss_dict.items(): + loss_meters[k].update(float(v)) + + timer_dataloading = time.time() + if opt.debug and batch_idx == 3: + break + + if training: + to_write = opt.train_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + loss_str=" ".join(["{} {:.4f}".format(k, v.avg) for k, v in loss_meters.items()])) + with open(opt.train_log_filepath, "a") as f: + f.write(to_write) + print("Epoch time stats:") + print("dataloading_time: max {dataloading_time.max} " + "min {dataloading_time.min} avg {dataloading_time.avg}\n" + "prepare_inputs_time: max {prepare_inputs_time.max} " + "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" + "model_forward_time: max {model_forward_time.max} " + "min {model_forward_time.min} avg {model_forward_time.avg}\n" + "model_backward_time: max {model_backward_time.max} " + "min {model_backward_time.min} avg {model_backward_time.avg}\n" + "".format(dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, + model_forward_time=model_forward_time, model_backward_time=model_backward_time)) + else: + for k, v in loss_meters.items(): + opt.writer.add_scalar("Eval_Loss/{}".format(k), v.avg, epoch_i) + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def train(model, train_dataset, val_dataset, opt): + # Prepare optimizer + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + train_loader = DataLoader(train_dataset, + collate_fn=start_end_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + # Prepare optimizer + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=opt.lr) + + prev_best_score = 0. 
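+    # Early-stopping bookkeeping: es_cnt counts epochs without improvement on the
+    # opt.stop_task metrics; training halts once es_cnt exceeds opt.max_es_cnt (if != -1).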
+ es_cnt = 0 + start_epoch = -1 if opt.eval_untrained else 0 + eval_tasks_at_training = opt.eval_tasks_at_training # VR is computed along with VCMR + save_submission_filename = \ + "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, "_".join(eval_tasks_at_training)) + for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): + if epoch_i > -1: + with torch.autograd.detect_anomaly(): + train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True) + global_step = (epoch_i + 1) * len(train_loader) + if opt.eval_path is not None: + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, val_dataset, opt, save_submission_filename, + tasks=eval_tasks_at_training, max_after_nms=100) + to_write = opt.eval_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + eval_metrics_str=json.dumps(metrics_no_nms)) + with open(opt.eval_log_filepath, "a") as f: + f.write(to_write) + logger.info("metrics_no_nms {}".format(pprint.pformat(rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) + logger.info("metrics_nms {}".format(pprint.pformat(metrics_nms, indent=4))) + + # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms + metrics = metrics_no_nms + # early stop/ log / save model + for task_type in ["SVMR", "VCMR"]: + if task_type in metrics: + task_metrics = metrics[task_type] + for iou_thd in [0.5, 0.7]: + opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), + {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, + global_step) + + task_type = "VR" + if task_type in metrics: + task_metrics = metrics[task_type] + opt.writer.add_scalars("Eval/{}".format(task_type), + {k: v for k, v in task_metrics.items()}, + global_step) + + # use the most strict metric available + stop_metric_names = ["r1"] if opt.stop_task == "VR" else ["0.5-r1", "0.7-r1"] + stop_score = sum([metrics[opt.stop_task][e] for e in stop_metric_names]) + + if stop_score > prev_best_score: + es_cnt = 0 + prev_best_score = stop_score + + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] + for src, tgt in zip(latest_file_paths, best_file_paths): + os.renames(src, tgt) + logger.info("The checkpoint file has been updated.") + else: + es_cnt += 1 + if opt.max_es_cnt != -1 and es_cnt > opt.max_es_cnt: # early stop + with open(opt.train_log_filepath, "a") as f: + f.write("Early Stop at epoch {}".format(epoch_i)) + logger.info("Early stop at {} with {} {}" + .format(epoch_i, " ".join([opt.stop_task] + stop_metric_names), prev_best_score)) + break + else: + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + if opt.debug: + break + + opt.writer.close() + + +def start_training(): + logger.info("Setup config, data and model...") + opt = BaseOptions().parse() + set_seed(opt.seed) + if opt.debug: # keep the model run deterministically + # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. + # Enable this only when input size is fixed. 
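+        # cudnn.deterministic = True additionally forces deterministic kernels,
+        # trading speed for run-to-run reproducibility while debugging.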
+ cudnn.benchmark = False + cudnn.deterministic = True + + opt.writer = SummaryWriter(opt.tensorboard_log_dir) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + train_dataset = ExCLDataset( + dset_name=opt.dset_name, + data_path=opt.train_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=opt.vid_feat_path, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + if opt.eval_path is not None: + eval_dataset = ExCLDataset( + dset_name=opt.dset_name, + data_path=opt.eval_path, + desc_bert_path_or_handler=train_dataset.desc_bert_h5, + sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, + clip_length=opt.clip_length, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + corpus_path=opt.corpus_path, + eval_split_name=opt.eval_split_name + ) + else: + eval_dataset = None + + model_config = EDict( + visual_input_size=opt.vid_feat_size, + sub_input_size=opt.sub_feat_size, # for both desc and subtitles + query_input_size=opt.q_feat_size, # for both desc and subtitles + hidden_size=opt.hidden_size, + drop=opt.drop, + ctx_mode=opt.ctx_mode, # video, sub or video_sub + initializer_range=opt.initializer_range + ) + logger.info("model_config {}".format(model_config)) + model = EXCL(model_config) + count_parameters(model) + logger.info("Start Training...") + train(model, train_dataset, eval_dataset, opt) + return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug + + +if __name__ == '__main__': + model_dir, eval_split_name, eval_path, debug = start_training() + if not debug: + model_dir = model_dir.split(os.sep)[-1] + tasks = ["SVMR"] + input_args = ["--model_dir", model_dir, + "--eval_split_name", eval_split_name, + "--eval_path", eval_path, + "--tasks"] + tasks + + import sys + sys.argv[1:] = input_args + logger.info("\n\n\nFINISHED TRAINING!!!") + logger.info("Evaluating model in {}".format(model_dir)) + logger.info("Input args {}".format(sys.argv[1:])) + start_inference() diff --git a/baselines/mixture_embedding_experts/README.md b/baselines/mixture_embedding_experts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5aa7a03f342e5f60876f416e281043c0044ff110 --- /dev/null +++ b/baselines/mixture_embedding_experts/README.md @@ -0,0 +1,14 @@ +# Mixture Embedding Experts (MEE) + +This folder contains the model described in the paper +``` +@article{miech18learning, + title={Learning a {T}ext-{V}ideo {E}mbedding from {I}ncomplete and {H}eterogeneous {D}ata}, + author={Miech, Antoine and Laptev, Ivan and Sivic, Josef}, + journal={arXiv:1804.02516}, + year={2018}, +} +``` + +Disclaimer: This code is implemented by [Jie Lei](http://www.cs.unc.edu/~jielei/) for the TVR dataset, +it does not guarantee the reproducibility of the original authors' results. 
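+
+For intuition only, the sketch below shows an MEE-style gated mixture of per-modality
+similarities in the spirit of the paper cited above. All names here (`MixtureOfExpertsSketch`,
+`expert_dims`, etc.) are made up for illustration and are not the classes implemented in this folder.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class MixtureOfExpertsSketch(nn.Module):
+    """Gated mixture of per-modality ("expert") similarities, MEE-style."""
+    def __init__(self, query_dim, expert_dims, joint_dim=256):
+        super().__init__()
+        # one projection per modality, plus a matching query projection for each
+        self.expert_proj = nn.ModuleList([nn.Linear(d, joint_dim) for d in expert_dims])
+        self.query_proj = nn.ModuleList([nn.Linear(query_dim, joint_dim) for _ in expert_dims])
+        # gating weights are predicted from the query alone
+        self.gate = nn.Linear(query_dim, len(expert_dims))
+
+    def forward(self, query_feat, expert_feats):
+        """query_feat: (N, Dq); expert_feats: list of (M, D_i) tensors, one per modality."""
+        gates = F.softmax(self.gate(query_feat), dim=-1)  # (N, n_experts)
+        sims = []
+        for i, feat in enumerate(expert_feats):
+            q = F.normalize(self.query_proj[i](query_feat), dim=-1)  # (N, joint_dim)
+            v = F.normalize(self.expert_proj[i](feat), dim=-1)       # (M, joint_dim)
+            sims.append(q @ v.t())                                   # (N, M) cosine similarities
+        sims = torch.stack(sims, dim=-1)                # (N, M, n_experts)
+        return (sims * gates.unsqueeze(1)).sum(dim=-1)  # (N, M) fused query-video scores
+
+
+if __name__ == "__main__":
+    model = MixtureOfExpertsSketch(query_dim=768, expert_dims=[2048, 768])
+    scores = model(torch.randn(4, 768), [torch.randn(10, 2048), torch.randn(10, 768)])
+    print(scores.shape)  # torch.Size([4, 10])
+```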
diff --git a/baselines/mixture_embedding_experts/__init__.py b/baselines/mixture_embedding_experts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baselines/mixture_embedding_experts/config.py b/baselines/mixture_embedding_experts/config.py new file mode 100644 index 0000000000000000000000000000000000000000..93d96d75651f79ca40b64fa5130ba88d5cfe2455 --- /dev/null +++ b/baselines/mixture_embedding_experts/config.py @@ -0,0 +1,164 @@ +import os +import time +import torch +import argparse + +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in corpus_path, must set for VCMR") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default="res", help="id of the current run") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + self.parser.add_argument("--no_core_driver", action="store_true", + help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") + self.parser.add_argument("--no_pin_memory", action="store_true", + help="Don't use pin_memory=True for dataloader. 
" + "ref: https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + self.parser.add_argument("--wd", type=float, default=0, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=50, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=10, help="number of epochs to early stop") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=1000, + help="mini-batch size at inference, for query") + self.parser.add_argument("--eval_ctx_bsz", type=int, default=200, + help="mini-batch size at inference, for proposals") + self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--margin", type=float, default=0.2, help="margin for hinge loss") + + # Model and Data config + self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") + self.parser.add_argument("--max_ctx_l", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") + + self.parser.add_argument("--train_path", type=str, default=None) + self.parser.add_argument("--eval_path", type=str, default=None, + help="Evaluating during training, for Dev set. If None, will only do training, " + "anet_cap and charades_sta has no dev set, so None") + self.parser.add_argument("--desc_bert_path", type=str, default=None) + self.parser.add_argument("--sub_bert_path", type=str, default=None) + self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") + self.parser.add_argument("--desc_feat_size", type=int, default=768) + self.parser.add_argument("--ctx_mode", type=str, + choices=["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"], + help="which context to use. 
a combination of [video, sub, tef]") + self.parser.add_argument("--vid_feat_path", type=str, default="") + self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") + self.parser.add_argument("--corpus_path", type=str, default=None) + self.parser.add_argument("--no_norm_vfeat", action="store_true", + help="Do not do normalization on video feat, use it when using i3d_resnet concat feat") + self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") + + self.parser.add_argument("--output_size", type=int, default=256) + + def display_save(self, opt): + args = vars(opt) + # Display settings + print("------------ Options -------------\n{}\n-------------------" + .format({str(k): str(v) for k, v in sorted(args.items())})) + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + def parse(self): + if not self.initialized: + self.initialize() + opt = self.parser.parse_args() + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + opt.no_core_driver = True + opt.num_workers = 0 + + if isinstance(self, TestOptions): + # modify model_dir to absolute path + opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. + if arg not in ["results_root", "num_workers", "nms_thd", "debug", + "eval_split_name", "eval_path", "eval_query_bsz", "eval_ctx_bsz"]: + setattr(opt, arg, saved_options[arg]) + # opt.no_core_driver = True + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.ctx_mode, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.realpath(__file__)) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"]) + + self.display_save(opt) + + if "sub" in opt.ctx_mode: + assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" + + if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d + assert opt.no_norm_vfeat + + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + opt.h5driver = None if opt.no_core_driver else "core" + # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 + opt.pin_memory = not opt.no_pin_memory + opt.num_workers = 1 if opt.no_core_driver else opt.num_workers + self.opt = opt + return opt + + +class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + 
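+        # reuse every BaseOptions argument, then add the evaluation-only flags below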
BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", choices=["VCMR", "SVMR", "VR"], default="SVMR", + help="Which tasks to run." + "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval.") diff --git a/baselines/mixture_embedding_experts/inference.py b/baselines/mixture_embedding_experts/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..b65d36bfa45dbc8400794832a9a8ce501a296f65 --- /dev/null +++ b/baselines/mixture_embedding_experts/inference.py @@ -0,0 +1,234 @@ +import os +import pprint +import time +from tqdm import tqdm, trange + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from baselines.mixture_embedding_experts.config import TestOptions +from baselines.mixture_embedding_experts.model import MEE +from baselines.mixture_embedding_experts.retrieval_dataset import \ + retrieval_collate, RetrievalEvalDataset, prepare_batch_inputs +from utils.basic_utils import save_json +from standalone_eval.eval import eval_retrieval + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def compute_context_embeddings(model, eval_dataset, opt): + """Use val set to do evaluation, remember to run with torch.no_grad(). + estimated 1000 (videos) * 300 (proposals) * 20 (clips) * 100 (hsz) * 4 / (1024 ** 3) = 2.24 GB + """ + model.eval() + eval_dataset.set_data_mode("context") + context_eval_loader = DataLoader(eval_dataset, + collate_fn=retrieval_collate, + batch_size=opt.eval_ctx_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + n_videos = len(eval_dataset) + eval_ctx_bsz = opt.eval_ctx_bsz + global_meta_list = [] # list(dicts) + global_video_embedding, global_sub_embedding = None, None + if model.use_video: + global_video_embedding = torch.empty((n_videos, model.config.output_size), + dtype=torch.float32, device=opt.device) # (N_q, D_o) + if model.use_sub: + global_sub_embedding = torch.empty((n_videos, model.config.output_size), + dtype=torch.float32, device=opt.device) # (N_q, D_o) + for idx, batch in tqdm(enumerate(context_eval_loader), + desc="Computing context embedding for videos", + total=len(context_eval_loader)): + global_meta_list.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + encoded_video, encoded_sub = model.encode_context(model_inputs["video_feat"], model_inputs["sub_feat"]) + if model.use_video: + global_video_embedding[idx * eval_ctx_bsz: (idx + 1) * eval_ctx_bsz] = encoded_video + if model.use_sub: + global_sub_embedding[idx * eval_ctx_bsz: (idx + 1) * eval_ctx_bsz] = encoded_sub + + if opt.debug and idx == 100: + break + return dict(video_meta=global_meta_list, + encoded_video=global_video_embedding, + encoded_sub=global_sub_embedding) + + +def compute_query2ctx_scores(model, eval_dataset, opt, max_n_videos=100): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ estimated size 20,000 (query) * 100 (hsz) * 4 / (1024**2) = 7.63 MB + """ + ctx_info = compute_context_embeddings(model, eval_dataset, opt) + + model.eval() + eval_dataset.set_data_mode("query") + query_eval_loader = DataLoader(eval_dataset, + collate_fn=retrieval_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=opt.pin_memory) + global_meta_list = [] # list(dicts) + eval_query_bsz = opt.eval_query_bsz + n_query = eval_query_bsz if opt.debug else len(eval_dataset) + all_scores = torch.empty((n_query, max_n_videos), dtype=torch.float32) # (N_q, max_n_videos) + all_indices = torch.empty((n_query, max_n_videos), dtype=torch.long) # (N_q, max_n_videos) + for idx, batch in tqdm(enumerate(query_eval_loader), + desc="Computing q embedding", + total=len(query_eval_loader)): + global_meta_list.extend(batch[0]) + model_inputs = prepare_batch_inputs(batch[1], device=opt.device, non_blocking=opt.pin_memory) + pooled_query = model.query_pooling(model_inputs["query_feat"]) # (Nq, Dt) + conf_matrix = model.get_score_from_pooled_query_with_encoded_ctx( + pooled_query, ctx_info["encoded_video"], ctx_info["encoded_sub"]) # (Nq, Nc) + sorted_values, sorted_indices = \ + torch.topk(conf_matrix, max_n_videos, dim=1, largest=True) # (Nq, max_n_videos) + all_scores[idx * eval_query_bsz: (idx + 1) * eval_query_bsz] = sorted_values.cpu() + all_indices[idx * eval_query_bsz: (idx + 1) * eval_query_bsz] = sorted_indices.cpu() + if opt.debug: + break + return dict( + video_meta=ctx_info["video_meta"], + query_meta=global_meta_list, + q2ctx_scores=all_scores, + q2ctx_indices=all_indices, + video2idx=eval_dataset.video2idx + ) + + +def generate_vr_predictions_from_res(eval_res): + video_meta = eval_res["video_meta"] # list, (Nc, ) + query_meta = eval_res["query_meta"] # list, (Nq, ) + video2idx = eval_res["video2idx"] + q2ctx_scores = eval_res["q2ctx_scores"] # (Nq, max_n_videos) + q2ctx_indices = eval_res["q2ctx_indices"] # (Nq, max_n_videos) + + vr_res = [] + for i, (scores_row, indices_row) in tqdm(enumerate(zip(q2ctx_scores, q2ctx_indices)), + desc="[VR] Loop over queries to generate predictions", + total=len(query_meta)): + cur_vr_redictions = [] + for j, (v_score, v_meta_idx) in enumerate(zip(scores_row, indices_row)): + video_idx = video2idx[video_meta[v_meta_idx]["vid_name"]] + cur_vr_redictions.append([video_idx, 0, 0, float(v_score)]) + cur_query_pred = dict( + query_id=query_meta[i]["query_id"], + desc=query_meta[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + return vr_res + + +def get_submission_top_n(submission, top_n=100): + def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + top_n_submission = dict(video2idx=submission["video2idx"], ) + for k in submission: + if k != "video2idx": + top_n_submission[k] = get_prediction_top_n(submission[k], top_n) + return top_n_submission + + +def eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=("SVMR",), max_before_nms=1000, max_after_nms=100): + model.eval() + logger.info("Computing scores") + logger.info("Start timing") + # times = [] + # for _ in range(3): + # st_time = time.time() + eval_res = compute_query2ctx_scores(model, eval_dataset, opt) + logger.info("Generating predictions from scores") + eval_submission_raw = dict(video2idx=eval_res["video2idx"]) + eval_submission_raw["VR"] = 
generate_vr_predictions_from_res(eval_res) + # times += [time.time() - st_time] + # times = torch.FloatTensor(times) + IOU_THDS = (0.5, 0.7) + + logger.info("Saving/Evaluating before nms results") + submission_path = os.path.join(opt.results_dir, save_submission_filename) + eval_submission = get_submission_top_n(eval_submission_raw, top_n=100) + save_json(eval_submission, submission_path) + + metrics = eval_retrieval(eval_submission, eval_dataset.query_data, + iou_thds=IOU_THDS, match_number=not opt.debug, verbose=opt.debug) + # metrics["time_avg"] = float(times.mean()) + # metrics["time_std"] = float(times.std()) + save_metrics_path = submission_path.replace(".json", "_metrics.json") + save_json(metrics, save_metrics_path, save_pretty=True, sort_keys=False) + latest_file_paths = [submission_path, save_metrics_path] + + metrics_nms = None + return metrics, metrics_nms, latest_file_paths + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + model = MEE(checkpoint["model_cfg"]) + model.load_state_dict(checkpoint["model"]) + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + assert opt.eval_path is not None + eval_dataset = RetrievalEvalDataset( + dset_name=opt.dset_name, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=opt.vid_feat_path, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + model = setup_model(opt) + save_submission_filename = \ + "inference_{}_{}_{}_predictions_{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks)) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, tasks=opt.tasks) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/baselines/mixture_embedding_experts/model.py b/baselines/mixture_embedding_experts/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ee00aa564f675813be1528045235f90115480d99 --- /dev/null +++ b/baselines/mixture_embedding_experts/model.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn +from baselines.mixture_embedding_experts.model_components import NetVLAD, MaxMarginRankingLoss, GatedEmbeddingUnit +from easydict import EasyDict as edict + +mee_base_cfg = edict( + ctx_mode="video", + text_input_size=768, + vid_input_size=1024, + output_size=256, + margin=0.2 +) + + +class MEE(nn.Module): + def __init__(self, config): + 
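+        # The query is pooled with a small NetVLAD (2 clusters). Each modality named in ctx_mode gets
+        # its own GatedEmbeddingUnit for the context and for the pooled query; when both video and
+        # subtitles are used, moe_fc predicts the query-dependent weights that mix the two streams.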
super(MEE, self).__init__() + self.config = config + self.use_video = "video" in config.ctx_mode + self.use_sub = "sub" in config.ctx_mode + + self.query_pooling = NetVLAD(feature_size=config.text_input_size, cluster_size=2) + + if self.use_sub: + self.sub_query_gu = GatedEmbeddingUnit(input_dimension=self.query_pooling.out_dim, + output_dimension=config.output_size) + self.sub_gu = GatedEmbeddingUnit(input_dimension=config.text_input_size, + output_dimension=config.output_size) + + if self.use_video: + self.video_query_gu = GatedEmbeddingUnit(input_dimension=self.query_pooling.out_dim, + output_dimension=config.output_size) + self.video_gu = GatedEmbeddingUnit(input_dimension=config.vid_input_size, + output_dimension=config.output_size) + + if self.use_video and self.use_sub: + self.moe_fc = nn.Linear(self.query_pooling.out_dim, 2) # weights + + self.max_margin_loss = MaxMarginRankingLoss(margin=config.margin) + + def forward(self, query_feat, query_mask, video_feat, sub_feat): + """ + Args: + query_feat: (N, L, D_q) + query_mask: (N, L) + video_feat: (N, Dv) + sub_feat: (N, Dt) + """ + pooled_query = self.query_pooling(query_feat) # (N, Dt) + encoded_video, encoded_sub = self.encode_context(video_feat, sub_feat) + confusion_matrix = self.get_score_from_pooled_query_with_encoded_ctx(pooled_query, encoded_video, encoded_sub) + return self.max_margin_loss(confusion_matrix) + + def encode_context(self, video_feat, sub_feat): + """(N, D)""" + encoded_video = self.video_gu(video_feat) if self.use_video else None + encoded_sub = self.sub_gu(sub_feat) if self.use_sub else None + return encoded_video, encoded_sub + + def compute_single_stream_scores_with_encoded_ctx(self, pooled_query, encoded_ctx, module_name="video"): + encoded_query = getattr(self, module_name+"_query_gu")(pooled_query) # (N, D) + return torch.einsum("md,nd->mn", encoded_query, encoded_ctx) # (N, N) + + def get_score_from_pooled_query_with_encoded_ctx(self, pooled_query, encoded_video, encoded_sub): + """Nq may not equal to Nc + Args: + pooled_query: (Nq, Dt) + encoded_video: (Nc, Dc) + encoded_sub: (Nc, Dc) + """ + + video_confusion_matrix = self.compute_single_stream_scores_with_encoded_ctx( + pooled_query, encoded_video, module_name="video") if self.use_video else 0 + sub_confusion_matrix = self.compute_single_stream_scores_with_encoded_ctx( + pooled_query, encoded_sub, module_name="sub") if self.use_sub else 0 + + if self.use_video and self.use_sub: + stream_weights = self.moe_fc(pooled_query) # (N, 2) + confusion_matrix = \ + stream_weights[:, 0:1] * video_confusion_matrix + stream_weights[:, 1:2] * sub_confusion_matrix + else: + confusion_matrix = video_confusion_matrix + sub_confusion_matrix + return confusion_matrix # (Nq, Nc) + diff --git a/baselines/mixture_embedding_experts/model_components.py b/baselines/mixture_embedding_experts/model_components.py new file mode 100644 index 0000000000000000000000000000000000000000..d5edefe523e8cf7274964009817337283d546bef --- /dev/null +++ b/baselines/mixture_embedding_experts/model_components.py @@ -0,0 +1,103 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class GatedEmbeddingUnit(nn.Module): + def __init__(self, input_dimension, output_dimension): + super(GatedEmbeddingUnit, self).__init__() + + self.fc = nn.Linear(input_dimension, output_dimension) + self.cg = ContextGating(output_dimension) + + def forward(self, x): + x = self.fc(x) + x = self.cg(x) + x = F.normalize(x) + return x + + +class ContextGating(nn.Module): + def 
__init__(self, dimension, add_batch_norm=True): + super(ContextGating, self).__init__() + self.fc = nn.Linear(dimension, dimension) + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(dimension) + + def forward(self, x): + x1 = self.fc(x) + + if self.add_batch_norm: + x1 = self.batch_norm(x1) + + x = torch.cat((x, x1), 1) + return F.glu(x, 1) + + +class MaxMarginRankingLoss(nn.Module): + def __init__(self, margin=1): + super(MaxMarginRankingLoss, self).__init__() + self.margin = margin + + def forward(self, x): + n = x.size()[0] + + x1 = torch.diag(x) + x1 = x1.unsqueeze(1) + x1 = x1.expand(n, n) + x1 = x1.contiguous().view(-1, 1) + x1 = torch.cat((x1, x1), 0) + + x2 = x.view(-1, 1) + x3 = x.transpose(0, 1).contiguous().view(-1, 1) + + x2 = torch.cat((x2, x3), 0) + + max_margin = F.relu(self.margin - (x1 - x2)) + return max_margin.mean() + + +class NetVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_batch_norm=True): + super(NetVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(feature_size, cluster_size)) + self.clusters2 = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(1, feature_size, cluster_size)) + + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(cluster_size) + self.out_dim = cluster_size * feature_size + + def forward(self, x): + max_sample = x.size()[1] + x = x.view(-1, self.feature_size) + assignment = torch.matmul(x, self.clusters) + + if self.add_batch_norm: + assignment = self.batch_norm(assignment) + + assignment = F.softmax(assignment, dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + a_sum = torch.sum(assignment, -2, keepdim=True) + a = a_sum * self.clusters2 + + assignment = assignment.transpose(1, 2) + + x = x.view(-1, max_sample, self.feature_size) + vlad = torch.matmul(assignment, x) + vlad = vlad.transpose(1, 2) + vlad = vlad - a + + # L2 intra norm + vlad = F.normalize(vlad) + + # flattening + L2 norm + vlad = vlad.view(-1, self.cluster_size * self.feature_size) + vlad = F.normalize(vlad) + + return vlad diff --git a/baselines/mixture_embedding_experts/retrieval_dataset.py b/baselines/mixture_embedding_experts/retrieval_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4e775f7d2aaa8594c1a9902070d23337ec760b74 --- /dev/null +++ b/baselines/mixture_embedding_experts/retrieval_dataset.py @@ -0,0 +1,283 @@ +""" +Dataset for clip model +""" +import logging +import torch +from torch.utils.data import Dataset +import numpy as np +import h5py +from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array, flat_list_of_lists, merge_dicts +from utils.tensor_utils import pad_sequences_1d + +logger = logging.getLogger(__name__) + + +class RetrievalDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + ctx_mode: str, + Return: + a dict: { + "meta": { + "query_id": int, + "desc": str, + "vid_name": str, + "duration": float, + "ts": [st (float), ed (float)], seconds, ground_truth timestamps + } + "model_inputs": { + "query_feat": torch.tensor, (L, D_q) + "video_feat": torch.tensor, (n_clip_in_moment, D_video) + "sub_feat": torch.tensor, (n_clip_in_moment, D_sub) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, + vid_feat_path_or_handler, max_desc_len, max_ctx_len, ctx_mode="video", + normalize_vfeat=True, 
normalize_tfeat=True, h5driver=None, data_ratio=1.0): + self.dset_name = dset_name + self.data_path = data_path + self.data_ratio = data_ratio + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + + self.desc_bert_path_or_handler = desc_bert_path_or_handler + self.sub_bert_path_or_handler = sub_bert_path_or_handler + self.vid_feat_path_or_handler = vid_feat_path_or_handler + self.ctx_mode = ctx_mode + + # prepare desc data + self.data = load_jsonl(data_path) + if self.data_ratio != 1: + n_examples = int(len(self.data) * data_ratio) + self.data = self.data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + raw_data = self.data[index] + + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ) + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + + ctx_l = 0 + if self.use_video: + video_feat = np.mean(self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) # (D, ) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + else: + model_inputs["video_feat"] = torch.zeros(2) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = np.mean(self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) # (N_clips, D_t) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros(2) + return dict(meta=meta, model_inputs=model_inputs) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + +class RetrievalEvalDataset(Dataset): + """ + init_data_mode: `video_query` or `video_only` or `query_only`, + it indicates which data to load when initialize the Dataset object. + data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() + desc_bert_path_or_handler: h5py.File object or str path + vid_feat_path_or_handler: h5py.File object or str path + eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with + max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. 
+ load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. + data_ratio: percentage of query data to use. + """ + def __init__(self, dset_name, eval_split_name, data_path=None, + desc_bert_path_or_handler=None, max_desc_len=None, max_ctx_len=None, + sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, + corpus_path=None, ctx_mode="video", data_mode="context", + h5driver=None, data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): + self.dset_name = dset_name + self.eval_split_name = eval_split_name + self.ctx_mode = ctx_mode + self.load_gt_video = False + self.data_ratio = data_ratio # only affect query data + self.normalize_vfeat = normalize_vfeat + self.normalize_tfeat = normalize_tfeat + + self.data_mode = None + self.set_data_mode(data_mode) + + self.max_desc_len = max_desc_len + self.max_ctx_len = max_ctx_len + self.data_path = data_path + self.query_data = load_jsonl(data_path) + if data_ratio != 1: + n_examples = int(len(self.query_data) * data_ratio) + self.query_data = self.query_data[:n_examples] + logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) + if isinstance(desc_bert_path_or_handler, h5py.File): + self.desc_bert_h5 = desc_bert_path_or_handler + else: + self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) + + video_data = load_json(corpus_path)[self.eval_split_name] + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + + self.use_video = "video" in self.ctx_mode + self.use_sub = "sub" in self.ctx_mode + self.use_tef = "tef" in self.ctx_mode + + if self.use_video: + if isinstance(vid_feat_path_or_handler, h5py.File): + self.vid_feat_h5 = vid_feat_path_or_handler + else: # str path + self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) + + if self.use_sub: + if isinstance(sub_bert_path_or_handler, h5py.File): + self.sub_bert_h5 = sub_bert_path_or_handler + else: # str path + self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) + + def set_data_mode(self, data_mode): + """context or query""" + assert data_mode in ["context", "query"] + self.data_mode = data_mode + + def load_gt_vid_name_for_query(self, load_gt_video): + """load_gt_video: bool, affect the returned value of self._get_item_query""" + assert "vid_name" in self.query_data[0] + self.load_gt_video = load_gt_video + + def __len__(self): + if self.data_mode == "context": + return len(self.video_data) + else: + return len(self.query_data) + + def __getitem__(self, index): + if self.data_mode == "context": + return self._get_item_context(index) + else: + return self._get_item_query(index) + + def get_query_feat_by_query_id(self, query_id): + query_feat = self.desc_bert_h5[str(query_id)][:self.max_desc_len] + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + return torch.from_numpy(query_feat) + + def _get_item_query(self, index): + """Need to batch""" + raw_data = self.query_data[index] + + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["desc"], + vid_name=raw_data["vid_name"] if self.load_gt_video else None + ) + + model_inputs = dict() + model_inputs["query_feat"] = self.get_query_feat_by_query_id(meta["query_id"]) + return dict(meta=meta, model_inputs=model_inputs) + + def _get_item_context(self, index): + """No need to batch, since it has already been batched here""" + raw_data = self.video_data[index] + + # initialize with 
basic data + meta = dict( + vid_name=raw_data["vid_name"], + duration=raw_data["duration"], + ) + + model_inputs = dict() + + if self.use_video: + video_feat = np.mean(self.vid_feat_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) # (1, D) + if self.normalize_vfeat: + video_feat = l2_normalize_np_array(video_feat) + model_inputs["video_feat"] = torch.from_numpy(video_feat) + else: + model_inputs["video_feat"] = torch.zeros(2) + + if self.use_sub: # no need for ctx feature, as the features are already contextulized + sub_feat = np.mean(self.sub_bert_h5[meta["vid_name"]][:self.max_ctx_len], axis=0) + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + model_inputs["sub_feat"] = torch.from_numpy(sub_feat) + else: + model_inputs["sub_feat"] = torch.zeros(2) + return dict(meta=meta, model_inputs=model_inputs) + + +def retrieval_collate(batch): + batch_meta = [e["meta"] for e in batch] # seems no need to collate ? + + model_inputs_keys = batch[0]["model_inputs"].keys() + batched_data = dict() + for k in model_inputs_keys: + if k == "query_feat": + batched_data[k] = pad_sequences_1d( + [e["model_inputs"][k] for e in batch], dtype=torch.float32, fixed_length=None) + elif "feat" in k: + batched_data[k] = torch.stack([e["model_inputs"][k] for e in batch]) + return batch_meta, batched_data + + +def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): + model_inputs = {} + for k, v in batched_model_inputs.items(): + if k == "query_feat": + model_inputs[k] = v[0].to(device, non_blocking=non_blocking) + model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) + else: + model_inputs[k] = v.to(device, non_blocking=non_blocking) + return model_inputs + + +if __name__ == '__main__': + from baselines.crossmodal_moment_localization.config import BaseOptions + options = BaseOptions().parse() diff --git a/baselines/mixture_embedding_experts/scripts/inference.sh b/baselines/mixture_embedding_experts/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..fca00159bae71e178e30b3ff7040e97329670469 --- /dev/null +++ b/baselines/mixture_embedding_experts/scripts/inference.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/mixture_embedding_experts/scripts/inference.sh ANY_OTHER_PYTHON_ARGS +model_dir=$1 +eval_split_name=$2 # [val ] +eval_path=data/tvr_${eval_split_name}_release.jsonl +tasks=() +tasks+=(VR) +echo "tasks ${tasks[@]}" +python baselines/mixture_embedding_experts/inference.py \ +--model_dir ${model_dir} \ +--tasks ${tasks[@]} \ +--eval_split_name ${eval_split_name} \ +--eval_path ${eval_path} \ +${@:3} diff --git a/baselines/mixture_embedding_experts/scripts/train.sh b/baselines/mixture_embedding_experts/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..53cd2504ee09f5f2f1a45f1dfa764b64416d2eb7 --- /dev/null +++ b/baselines/mixture_embedding_experts/scripts/train.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# run at project root dir +# Usage: +# bash baselines/clip_alignment_with_language/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS +dset_name=$1 # see case below +ctx_mode=$2 # ["video", "sub", "tef", "video_sub", "video_tef", "sub_tef", "video_sub_tef"] +vid_feat_type=$3 # [resnet, i3d, resnet_i3d, none] , none for subtitles only models +feature_root=data/tvr_feature_release +results_root=baselines/mixture_embedding_experts/results +vid_feat_size=2048 +extra_args=() + +if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" 
]]; then + if [[ ${dset_name} != "tvr" ]]; then + echo "The use of subtitles is only supported in tvr." + exit 1 + fi +fi + + +case ${dset_name} in + tvr) + train_path=data/tvr_train_release.jsonl + corpus_path=data/tvr_video2dur_idx.json + desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + clip_length=1.5 + eval_split_name=val + nms_thd=-1 + extra_args+=(--eval_path) + extra_args+=(data/tvr_val_release.jsonl) + + if [[ ${vid_feat_type} == "i3d" ]]; then + echo "Using I3D feature with shape 1024" + vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 + vid_feat_size=1024 + elif [[ ${vid_feat_type} == "resnet" ]]; then + echo "Using ResNet feature with shape 2048" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 + vid_feat_size=2048 + elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then + echo "Using concatenated ResNet and I3D feature with shape 2048+1024" + vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 + vid_feat_size=3072 + extra_args+=(--no_norm_vfeat) # since they are already normalized. + fi + + if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then + echo "Running with sub." + desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite + sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 + sub_feat_size=768 + extra_args+=(--sub_feat_size) + extra_args+=(${sub_feat_size}) + extra_args+=(--sub_bert_path) + extra_args+=(${sub_bert_path}) + fi + ;; + *) + echo -n "Unknown argument" + ;; +esac + +echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" +echo "Extra args ${extra_args[@]}" +python baselines/mixture_embedding_experts/train.py \ +--dset_name=${dset_name} \ +--eval_split_name=${eval_split_name} \ +--results_root=${results_root} \ +--train_path=${train_path} \ +--desc_bert_path=${desc_bert_path} \ +--corpus_path=${corpus_path} \ +--vid_feat_path=${vid_feat_path} \ +--vid_feat_size=${vid_feat_size} \ +--ctx_mode=${ctx_mode} \ +${extra_args[@]} \ +${@:4} diff --git a/baselines/mixture_embedding_experts/train.py b/baselines/mixture_embedding_experts/train.py new file mode 100644 index 0000000000000000000000000000000000000000..303e59c4e458235df01f2c3ad9969ad3a88e5dc1 --- /dev/null +++ b/baselines/mixture_embedding_experts/train.py @@ -0,0 +1,280 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from collections import OrderedDict +from easydict import EasyDict as EDict +from tqdm import tqdm, trange + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from baselines.mixture_embedding_experts.config import BaseOptions +from baselines.mixture_embedding_experts.model import MEE +from baselines.mixture_embedding_experts.retrieval_dataset import \ + RetrievalDataset, retrieval_collate, RetrievalEvalDataset, prepare_batch_inputs +from baselines.mixture_embedding_experts.inference import eval_epoch, start_inference +from utils.basic_utils import save_jsonl, save_json, AverageMeter +from utils.model_utils import count_parameters + + +import logging +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d 
%H:%M:%S", + level=logging.INFO) + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, train_loader, optimizer, opt, epoch_i): + model.train() + + # init meters + dataloading_time = AverageMeter() + prepare_inputs_time = AverageMeter() + model_forward_time = AverageMeter() + model_backward_time = AverageMeter() + loss_meter = AverageMeter() + + num_training_examples = len(train_loader) + timer_dataloading = time.time() + for batch_idx, batch in tqdm(enumerate(train_loader), + desc="Training Iteration", + total=num_training_examples): + dataloading_time.update(time.time() - timer_dataloading) + + # continue + timer_start = time.time() + model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) + prepare_inputs_time.update(time.time() - timer_start) + timer_start = time.time() + loss = model(**model_inputs) + model_forward_time.update(time.time() - timer_start) + timer_start = time.time() + optimizer.zero_grad() + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + model_backward_time.update(time.time() - timer_start) + + global_step = epoch_i * num_training_examples + batch_idx + opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) + opt.writer.add_scalar("Train/Loss", float(loss), global_step) + loss_meter.update(float(loss)) + + timer_dataloading = time.time() + if opt.debug and batch_idx == 3: + break + to_write = opt.train_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + loss_str=str(loss_meter.avg)) + with open(opt.train_log_filepath, "a") as f: + f.write(to_write) + print("Epoch time stats:") + print("dataloading_time: max {dataloading_time.max} " + "min {dataloading_time.min} avg {dataloading_time.avg}\n" + "prepare_inputs_time: max {prepare_inputs_time.max} " + "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" + "model_forward_time: max {model_forward_time.max} " + "min {model_forward_time.min} avg {model_forward_time.avg}\n" + "model_backward_time: max {model_backward_time.max} " + "min {model_backward_time.min} avg {model_backward_time.avg}\n" + "".format(dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, + model_forward_time=model_forward_time, model_backward_time=model_backward_time)) + + +def train(model, train_dataset, val_dataset, opt): + # Prepare optimizer + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + if len(opt.device_ids) > 1: + logger.info("Use multi GPU", opt.device_ids) + model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=opt.lr) + # reduce the lr by 0.1 every 30 epochs + scheduler = torch.optim.lr_scheduler.ExponentialLR( + optimizer, + gamma=0.95 + ) + + train_loader = DataLoader(train_dataset, + collate_fn=retrieval_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=opt.pin_memory) + + prev_best_score = 0. 
+ es_cnt = 0 + start_epoch = -1 if opt.eval_untrained else 0 + eval_tasks_at_training = ["VR"] + save_submission_filename = \ + "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, "_".join(eval_tasks_at_training)) + for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): + if epoch_i > -1: + with torch.autograd.detect_anomaly(): + train_epoch(model, train_loader, optimizer, opt, epoch_i) + global_step = (epoch_i + 1) * len(train_loader) + scheduler.step() + if opt.eval_path is not None: + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, val_dataset, opt, save_submission_filename, tasks=eval_tasks_at_training) + logger.info("metrics_no_nms {}".format( + pprint.pformat(rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + to_write = opt.eval_log_txt_formatter.format( + time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), + epoch=epoch_i, + eval_metrics_str=json.dumps(metrics_no_nms)) + with open(opt.eval_log_filepath, "a") as f: + f.write(to_write) + + # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms + metrics = metrics_no_nms + # early stop/ log / save model + for task_type, task_metrics in metrics.items(): + for iou_thd in [0.5, 0.7]: + opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), + {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, + global_step) + + # use the most strict metric available + if metrics["VR"]["r1"] > prev_best_score: + es_cnt = 0 + prev_best_score = metrics["VR"]["r1"] + + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] + for src, tgt in zip(latest_file_paths, best_file_paths): + os.renames(src, tgt) + logger.info("The checkpoint file has been updated.") + else: + es_cnt += 1 + if es_cnt > opt.max_es_cnt: # early stop + with open(opt.train_log_filepath, "a") as f: + f.write("Early Stop at epoch {}".format(epoch_i)) + logger.info("Early stop at {} with VR r1 {}".format(epoch_i, prev_best_score)) + break + else: + checkpoint = { + "model": model.state_dict(), + "model_cfg": model.config, + "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + + if opt.debug: + break + + opt.writer.close() + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def start_training(): + logger.info("Setup config, data and model...") + opt = BaseOptions().parse() + set_seed(opt.seed) + if opt.debug: # keep the model run deterministically + # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a specific input/net config. + # Enable this only when input size is fixed. 
+ cudnn.benchmark = False + cudnn.deterministic = True + + opt.writer = SummaryWriter(opt.tensorboard_log_dir) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + train_dataset = RetrievalDataset( + dset_name=opt.dset_name, + data_path=opt.train_path, + desc_bert_path_or_handler=opt.desc_bert_path, + sub_bert_path_or_handler=opt.sub_bert_path, + vid_feat_path_or_handler=opt.vid_feat_path, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + ctx_mode=opt.ctx_mode, + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + + if opt.eval_path is not None: + eval_dataset = RetrievalEvalDataset( + dset_name=opt.dset_name, + eval_split_name=opt.eval_split_name, # should only be val set + data_path=opt.eval_path, + desc_bert_path_or_handler=train_dataset.desc_bert_h5, + sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, + max_desc_len=opt.max_desc_l, + max_ctx_len=opt.max_ctx_l, + corpus_path=opt.corpus_path, + vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, + ctx_mode=opt.ctx_mode, + data_mode="query", + h5driver=opt.h5driver, + data_ratio=opt.data_ratio, + normalize_vfeat=not opt.no_norm_vfeat, + normalize_tfeat=not opt.no_norm_tfeat, + ) + else: + eval_dataset = None + + model_config = EDict( + ctx_mode=opt.ctx_mode, + text_input_size=opt.sub_feat_size, + vid_input_size=opt.vid_feat_size, # + output_size=opt.output_size, + margin=opt.margin, # margin for ranking loss + ) + logger.info("model_config {}".format(model_config)) + model = MEE(model_config) + count_parameters(model) + logger.info("Start Training...") + train(model, train_dataset, eval_dataset, opt) + return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug + + +if __name__ == '__main__': + model_dir, eval_split_name, eval_path, debug = start_training() + if not debug: + model_dir = model_dir.split(os.sep)[-1] + tasks = ["VR"] + input_args = ["--model_dir", model_dir, + "--eval_split_name", eval_split_name, + "--eval_path", eval_path, + "--tasks"] + tasks + + import sys + sys.argv[1:] = input_args + logger.info("\n\n\nFINISHED TRAINING!!!") + logger.info("Evaluating model in {}".format(model_dir)) + start_inference() diff --git a/baselines/profiling/README.md b/baselines/profiling/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6a15400bc8123c78a740e0d19f8bd81041395914 --- /dev/null +++ b/baselines/profiling/README.md @@ -0,0 +1,5 @@ +# Profiling + +### Additional Requirements: +- [FAISS](https://github.com/facebookresearch/faiss/) for nearest neighbor search, +install it by `pip install faiss-gpu==1.6.1`. diff --git a/baselines/profiling/profile_main.py b/baselines/profiling/profile_main.py new file mode 100644 index 0000000000000000000000000000000000000000..76577690697937632ce3d2513bc5e45ebfbe48a6 --- /dev/null +++ b/baselines/profiling/profile_main.py @@ -0,0 +1,485 @@ +""" +Profile the time needed for retrieval. +We consider retrieval in a corpus of 1M videos, 1K videos are added, 10K queries are retrieved. +Calculate the time needed for adding 1K videos, and performing retrieval for 10K queries. + +1, Data Loading time is ignored, consider it is hidden by computation time. +2, Sort time is ignored, since it is the similar among the methods. 
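+3, Reported numbers are per-batch averages; the summary saved at the bottom of this file extrapolates them to the full corpus, e.g. encoding all videos is estimated roughly as avg_batch_time * N_Videos / ctx_batch_size.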
+""" +import os +import time +import torch +import torch.nn as nn +import torch.nn.functional as F +import pprint +from tqdm import tqdm, trange +from baselines.crossmodal_moment_localization.model_xml import XML, xml_base_config +from baselines.mixture_embedding_experts.model import MEE, mee_base_cfg +from baselines.clip_alignment_with_language.model import CALWithSub, cal_base_cfg +from baselines.excl.model import EXCL, excl_base_cfg +from utils.basic_utils import save_json + + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + +def mask_logits(target, mask): + return target * mask + (1 - mask) * (-1e10) + + +class ProfileBase(object): + N_NewQuery = 1e4 + N_NewVideo = 1e3 + N_Videos = 1e6 + AvgVideoLength = 100 + ClipLength = 5 + AvgClipPerVideo = int(AvgVideoLength / ClipLength) # max_ctx_l + AvgWordInQuery = 15 + # estimated by + # scales=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], => max_proposal = 14 + AvgProposalPerVideo = 170 + MaxClipPerProposal = 14 # pad to this length + AvgClipPerProposal = 7 # 6.88 + + VideoFeatureDim = 3074 # 1024 + 2048 + 2 (TEF) + SubFeatureDim = 770 + QueryFeatureDim = 768 + + HiddenSize = 256 + N_Runs = 5 # Get the average time + + def __init__(self, device=torch.device("cuda:0"), ctx_batch_size=400, query_batch_size=100): + self.device = device + self.ctx_batch_size = ctx_batch_size + self.query_batch_size = query_batch_size + self.model_config = self.get_model_config() + print(self.model_config) + self.model = self.get_model() + + def get_model(self): + return None + + def get_model_config(self): + return None + + def set_ctx_batch_size(self, batch_size): + self.ctx_batch_size = batch_size + + def set_query_batch_size(self, batch_size): + self.query_batch_size = batch_size + + def cast_dict_inputs_to_device(self, dict_inputs, device): + return {k: v.to(device) for k, v in dict_inputs.items()} + + def get_fake_ctx_raw_input_st_ed(self, no_tef=False): + return dict( + video_feat=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l, + self.VideoFeatureDim - 2*no_tef), + sub_feat=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l, self.SubFeatureDim - 2*no_tef), + ctx_mask=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l), + ) + + def get_fake_raw_query(self): + return dict( + query_feat=torch.FloatTensor(self.query_batch_size, self.AvgWordInQuery, self.QueryFeatureDim), + query_mask=torch.ones(self.query_batch_size, self.AvgWordInQuery) + ) + + +""" +from baselines.profiling.profile_main import ProfileXML +profile_xml = ProfileXML(ctx_batch_size=400, query_batch_size=100) +profile_xml.get_ctx_encoding_time() +""" + + +class ProfileXML(ProfileBase): + def get_model_config(self): + xml_base_config["ctx_mode"] = "video_sub_tef" + xml_base_config["merge_two_stream"] = True + xml_base_config["cross_att"] = True + xml_base_config["max_ctx_l"] = self.AvgClipPerVideo + xml_base_config["visual_input_size"] = self.VideoFeatureDim + xml_base_config["query_input_size"] = self.QueryFeatureDim + xml_base_config["sub_input_size"] = self.SubFeatureDim + xml_base_config["hidden_size"] = self.HiddenSize + return xml_base_config + + def get_model(self): + model = XML(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_encoded_ctx(self): + return dict( + ctx_feat=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l, 
self.HiddenSize), + ctx_mask=torch.FloatTensor(self.ctx_batch_size, self.model_config.max_ctx_l), + ) + + def get_fake_encoded_query(self): + return dict(query_feat=torch.FloatTensor(self.ctx_batch_size, self.HiddenSize)) + + def _get_ctx_encoding_time(self, video_feat, sub_feat, ctx_mask): + """Considered two modalities""" + torch.cuda.synchronize() + st_time = time.time() + self.model.cross_encode_context(video_feat, ctx_mask, sub_feat, ctx_mask) + torch.cuda.synchronize() + return time.time() - st_time + + def get_ctx_encoding_time(self): + with torch.no_grad(): + fake_ctx_inputs = self.cast_dict_inputs_to_device(self.get_fake_ctx_raw_input_st_ed(), self.device) + raw_video = fake_ctx_inputs["video_feat"] + raw_sub = fake_ctx_inputs["sub_feat"] + ctx_mask = fake_ctx_inputs["ctx_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_ctx_encoding_time(raw_video, raw_sub, ctx_mask)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_query_encoding_time(self, raw_query, query_mask): + """Considered two modalities""" + torch.cuda.synchronize() + st_time = time.time() + encoded_query = self.model.encode_input(raw_query, query_mask, + self.model.query_input_proj, + self.model.query_encoder, + self.model.query_pos_embed) # (N, Lq, D) + # video level + video_query, sub_query = \ + self.model.get_modularized_queries(encoded_query, query_mask, return_modular_att=False) + # st ed + video_query = self.model.video_query_linear(video_query) + sub_query = self.model.sub_query_linear(sub_query) + torch.cuda.synchronize() + return time.time() - st_time + + def get_query_encoding_time(self): + with torch.no_grad(): + query_inputs = self.cast_dict_inputs_to_device(self.get_fake_raw_query(), self.device) + raw_query = query_inputs["query_feat"] + query_mask = query_inputs["query_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_query_encoding_time(raw_query, query_mask)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_retrieval_time(self, encoded_video_query, encoded_video, ctx_mask): + """Consider the queries are encoded, Calculate in a single modality then multiply by 2.""" + torch.cuda.synchronize() + st_time = time.time() + self.model.get_video_level_scores(encoded_video_query, encoded_video, ctx_mask) + torch.cuda.synchronize() + return (time.time() - st_time) * 2 + + def get_retrieval_time(self): + with torch.no_grad(): + encoded_query = self.cast_dict_inputs_to_device(self.get_fake_encoded_query(), self.device)["query_feat"] + fake_ctx_inputs = self.cast_dict_inputs_to_device(self.get_fake_encoded_ctx(), self.device) + encoded_ctx = fake_ctx_inputs["ctx_feat"] + ctx_mask = fake_ctx_inputs["ctx_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_retrieval_time(encoded_query, encoded_ctx, ctx_mask)] + times = torch.FloatTensor(times) # since we have two modalities + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_span_prediction_time(self, query_feat, ctx_feat, ctx_mask): + """Considered two modalities""" + torch.cuda.synchronize() + st_time = time.time() + similarity = torch.einsum("md,nld->mnl", query_feat, ctx_feat) + similarity = (similarity + similarity) / 2 # (Nq, Nv, L) from query to all videos. 
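+        # adding the map to itself (then halving) mimics fusing a second, subtitle-stream similarity map
+        # of the same size, so that fusion cost is included in the measured span-prediction time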
+ n_q, n_c, l = similarity.shape + similarity = similarity.view(n_q * n_c, 1, l) + st_prob = self.model.merged_st_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + ed_prob = self.model.merged_ed_predictor(similarity).view(n_q, n_c, l) # (Nq, Nv, L) + st_prob = mask_logits(st_prob, ctx_mask) # (N, L) + ed_prob = mask_logits(ed_prob, ctx_mask) + torch.cuda.synchronize() + return time.time() - st_time + + def get_span_prediction_time(self): + with torch.no_grad(): + encoded_query = self.cast_dict_inputs_to_device(self.get_fake_encoded_query(), self.device)["query_feat"] + fake_ctx_inputs = self.cast_dict_inputs_to_device(self.get_fake_encoded_ctx(), self.device) + encoded_ctx = fake_ctx_inputs["ctx_feat"] + ctx_mask = fake_ctx_inputs["ctx_mask"] + times = [] + for _ in trange(self.N_Runs): + times += [self._get_span_prediction_time(encoded_query, encoded_ctx, ctx_mask)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +""" +from baselines.profiling.profile_main import ProfileMEE +profile_mee = ProfileMEE(ctx_batch_size=400, query_batch_size=100) +profile_mee.get_ctx_encoding_time() +""" + + +class ProfileMEE(ProfileBase): + def get_model_config(self): + mee_base_cfg["ctx_mode"] = "video_sub" + mee_base_cfg["text_input_size"] = self.QueryFeatureDim + mee_base_cfg["vid_input_size"] = self.VideoFeatureDim + mee_base_cfg["output_size"] = self.HiddenSize + return mee_base_cfg + + def get_model(self): + model = MEE(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_raw_ctx(self): + return dict( + vid_feat=torch.FloatTensor(self.ctx_batch_size, self.VideoFeatureDim), + sub_feat=torch.FloatTensor(self.ctx_batch_size, self.QueryFeatureDim) + ) + + def get_fake_encoded_ctx_query(self): + return dict( + ctx_feat=torch.FloatTensor(self.ctx_batch_size, self.HiddenSize), + query_feat=torch.FloatTensor(self.ctx_batch_size, self.HiddenSize) + ) + + def _get_ctx_encoding_time(self, vid_feat, sub_feat): + torch.cuda.synchronize() + st_time = time.time() + self.model.video_gu(vid_feat) + self.model.sub_gu(sub_feat) + torch.cuda.synchronize() + return time.time() - st_time + + def get_ctx_encoding_time(self): + feat_dict = self.cast_dict_inputs_to_device(self.get_fake_raw_ctx(), self.device) + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_ctx_encoding_time(**feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_query_encoding_time(self, query_feat): + """Considered 2 modalities""" + torch.cuda.synchronize() + st_time = time.time() + pooled_query = self.model.query_pooling(query_feat) # (N, Dt) + video_query = self.model.video_query_gu(pooled_query) + sub_query = self.model.sub_query_gu(pooled_query) + stream_weights = self.model.moe_fc(pooled_query) # (N, 2) + torch.cuda.synchronize() + return time.time() - st_time + + def get_query_encoding_time(self): + raw_query = self.cast_dict_inputs_to_device(self.get_fake_raw_query(), self.device)["query_feat"] + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_query_encoding_time(raw_query)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_retrieval_time(self, encoded_query, encoded_ctx): + """Considered 2 modalities""" + torch.cuda.synchronize() + st_time = time.time() + torch.einsum("md,nd->mn", encoded_query, encoded_ctx) # (N, N) + torch.cuda.synchronize() + 
return (time.time() - st_time) * 2 + + def get_retrieval_time(self): + model_inputs = self.cast_dict_inputs_to_device(self.get_fake_encoded_ctx_query(), self.device) + encoded_query = model_inputs["ctx_feat"] + encoded_ctx = model_inputs["query_feat"] + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_retrieval_time(encoded_query, encoded_ctx)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +class ProfileCAL(ProfileBase): + def get_model_config(self): + cal_base_cfg["ctx_mode"] = "video_sub" + cal_base_cfg["embedding_size"] = self.QueryFeatureDim + cal_base_cfg["visual_input_size"] = self.VideoFeatureDim * 2 + cal_base_cfg["textual_input_size"] = self.SubFeatureDim * 2 + cal_base_cfg["output_size"] = self.HiddenSize + return cal_base_cfg + + def get_model(self): + model = CALWithSub(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_raw_ctx(self, model_name="cal"): + """The features are `*2` since they use both global and local features""" + return dict( + sub_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgProposalPerVideo, + self.AvgClipPerProposal, self.SubFeatureDim * 2), + vid_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgProposalPerVideo, + self.AvgClipPerProposal, self.VideoFeatureDim * 2)) + + def _get_ctx_encoding_time(self, sub_feat, vid_feat, model_name="cal"): + if model_name == "mcn": + sub_feat = sub_feat.sum(2) + vid_feat = vid_feat.sum(2) + torch.cuda.synchronize() + st_time = time.time() + self.model.moment_encoder(vid_feat, module_name="video") + self.model.moment_encoder(sub_feat, module_name="sub") + torch.cuda.synchronize() + return time.time() - st_time + + def get_ctx_encoding_time(self, model_name="cal"): + """model_name: str, `cal` or `mcn`""" + feat_dict = self.cast_dict_inputs_to_device( + self.get_fake_raw_ctx(model_name=model_name), self.device) + feat_dict["model_name"] = model_name + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_ctx_encoding_time(**feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + def _get_query_encoding_time(self, query_feat, query_mask): + torch.cuda.synchronize() + st_time = time.time() + self.model.query_encoder(query_feat, query_mask) + torch.cuda.synchronize() + return time.time() - st_time + + def get_query_encoding_time(self): + feat_dict = self.cast_dict_inputs_to_device(self.get_fake_raw_query(), self.device) + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_query_encoding_time(**feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +class ProfileExCL(ProfileBase): + def get_model_config(self): + excl_base_cfg["ctx_mode"] = "video_sub" + excl_base_cfg["query_input_size"] = self.QueryFeatureDim + excl_base_cfg["visual_input_size"] = self.VideoFeatureDim + excl_base_cfg["sub_input_size"] = self.SubFeatureDim + excl_base_cfg["output_size"] = self.HiddenSize + return excl_base_cfg + + def get_model(self): + model = EXCL(self.model_config) + model.to(self.device) + model.eval() + return model + + def get_fake_raw_input(self): + """The features are `*2` since they use both global and local features""" + return dict( + query_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgWordInQuery, self.QueryFeatureDim), + query_mask=torch.ones((self.ctx_batch_size, self.AvgWordInQuery)), + 
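# the entries below mirror ExCL's remaining per-clip inputs: subtitle, video and temporal endpoint (tef) features plus their masks +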
sub_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgClipPerVideo, self.SubFeatureDim), + sub_mask=torch.ones(self.ctx_batch_size, self.AvgClipPerVideo), + video_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgClipPerVideo, self.VideoFeatureDim), + video_mask=torch.ones(self.ctx_batch_size, self.AvgClipPerVideo), + tef_feat=torch.FloatTensor(self.ctx_batch_size, self.AvgClipPerVideo, 2), + tef_mask=torch.ones(self.ctx_batch_size, self.AvgClipPerVideo), + st_ed_indices=torch.ones(2, 2), # not used. + ) + + def _get_prediction_time(self, input_dict): + torch.cuda.synchronize() + st_time = time.time() + self.model(**input_dict) + torch.cuda.synchronize() + return time.time() - st_time + + def get_prediction_time(self): + """model_name: str, `cal` or `mcn`""" + feat_dict = self.cast_dict_inputs_to_device( + self.get_fake_raw_input(), self.device) + feat_dict["is_training"] = False + with torch.no_grad(): + times = [] + for _ in trange(self.N_Runs): + times += [self._get_prediction_time(feat_dict)] + times = torch.FloatTensor(times) + return dict(avg=float(times.mean()), std=float(times.std())) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="") + parser.add_argument("--ctx_batch_size", type=int, default=400) + parser.add_argument("--query_batch_size", type=int, default=100) + parser.add_argument("--save_dir", type=str, default="baselines/profiling/cache") + args = parser.parse_args() + + model = args.model + query_batch_size = args.query_batch_size + ctx_batch_size = args.ctx_batch_size + if model == "mee": + profile_mee = ProfileMEE(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_mee.get_ctx_encoding_time() + ctx_enc_time = profile_mee.get_ctx_encoding_time() + query_enc_time = profile_mee.get_query_encoding_time() + elif model == "cal": + profile_cal = ProfileCAL(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_cal.get_ctx_encoding_time() + ctx_enc_time = profile_cal.get_ctx_encoding_time(model_name="cal") + query_enc_time = profile_cal.get_query_encoding_time() + elif model == "mcn": + profile_cal = ProfileCAL(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_cal.get_ctx_encoding_time() + ctx_enc_time = profile_cal.get_ctx_encoding_time(model_name="mcn") + query_enc_time = profile_cal.get_query_encoding_time() + elif model == "xml": + profile_xml = ProfileXML(ctx_batch_size=ctx_batch_size, query_batch_size=query_batch_size) + # use the 2nd one to report time + profile_xml.get_ctx_encoding_time() + ctx_enc_time = profile_xml.get_ctx_encoding_time() + query_enc_time = profile_xml.get_query_encoding_time() + elif model == "excl": + profile_excl = ProfileExCL(ctx_batch_size=ctx_batch_size, query_batch_size=ctx_batch_size) + # use the 2nd one to report time + profile_excl.get_prediction_time() + ctx_enc_time = profile_excl.get_prediction_time() + query_enc_time = 0 + # Calculate the total time as ctx_enc_time * (100 * 1M / ctx_batch_size) + else: + raise NotImplementedError + # ctx_enc_time = ctx_enc_time + save_path = os.path.join(args.save_dir, "{}_profile_main.json".format(model)) + + n_videos = ProfileBase.N_Videos + res = dict( + ctx_enc_time=ctx_enc_time, + ctx_enc_avg_time_all_videos=ctx_enc_time["avg"] * n_videos / ctx_batch_size, + query_enc_time=query_enc_time, + n_videos=n_videos, + 
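# ctx_enc_avg_time_all_videos above extrapolates the measured per-batch encoding time to the full corpus of n_videos +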
ctx_batch_size=ctx_batch_size, + query_batch_size=query_batch_size, + model=model + ) + save_json(res, save_path, save_pretty=True) + pprint.pprint(res) diff --git a/baselines/profiling/profile_main.sh b/baselines/profiling/profile_main.sh new file mode 100644 index 0000000000000000000000000000000000000000..bf9235dfc1f7dd6a64079809ed250d832f7c75cf --- /dev/null +++ b/baselines/profiling/profile_main.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +model=$1 +ctx_batch_size=$2 +save_dir=baselines/profiling/cache + +python baselines/profiling/profile_main.py \ +--model ${model} \ +--ctx_batch_size ${ctx_batch_size} \ +--query_batch_size 100 \ +--save_dir ${save_dir} + diff --git a/baselines/profiling/search_time_performance.py b/baselines/profiling/search_time_performance.py new file mode 100644 index 0000000000000000000000000000000000000000..ed4b322e1098822d0c81014248469637f069e290 --- /dev/null +++ b/baselines/profiling/search_time_performance.py @@ -0,0 +1,318 @@ +""" +Compute search time needed for searching 100 new queries in a corpus containing 1M videos. +The performance reported is tested on 1.4.0.dev20191109 with Python3.7 and CUDA10.1. + +This experiment is simulated. +""" + +import os +import time +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from utils.basic_utils import save_json + +import logging + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + +np.random.seed(1234) + + +def compare_l2dist_inner_product_time(n_videos=2000, d=256, n_query=1000, n_runs=10, n_warmup_runs=10): + """In some PyTorch/Cuda Verison, torch.cdist is very slow, which affects this comparison. + See https://discuss.pytorch.org/t/cdist-vs-matmul/61682/5""" + torch.cuda.synchronize() + st_time = time.time() + fake_database = F.normalize(torch.randn((n_videos, d), dtype=torch.float32).cuda(), dim=1, p=2) + fake_query = F.normalize(torch.randn((n_query, d), dtype=torch.float32).cuda(), dim=1, p=2) + torch.cuda.synchronize() + print("Construct fake database + query time {}".format(time.time() - st_time)) + print("fake_database shape {} fake_query shape {}".format(fake_database.shape, fake_query.shape)) + + times_l2dist = [] + for _ in range(n_warmup_runs + n_runs): + torch.cuda.synchronize() + st_time = time.time() + l2_dist = torch.cdist(fake_query, fake_database, p=2) # (n_query, n_videos) + torch.cuda.synchronize() + times_l2dist.append(time.time() - st_time) + avg_time_l2dist = np.mean(times_l2dist[n_warmup_runs:]) + print("L2 Distance time {}".format(avg_time_l2dist)) + + times_ip = [] + fake_database = fake_database.transpose(0, 1) + for _ in range(n_warmup_runs + n_runs): + torch.cuda.synchronize() + st_time = time.time() + inner_product = torch.mm(fake_query, fake_database) # (n_query, n_videos) + torch.cuda.synchronize() + times_ip.append(time.time() - st_time) + avg_time_ip = np.mean(times_ip[n_warmup_runs:]) + print("Inner Product time {}".format(avg_time_ip)) + + +def run_example(): + """ + In Python, the matrices are always represented as numpy arrays. + The data type dtype must be float32. + """ + # -------------------------------- + # Step 1: Get Data + # -------------------------------- + import faiss + d = 64 # dimension + nb = 100000 # database size + nq = 10000 # nb of queries + np.random.seed(1234) # make reproducible + xb = np.random.random((nb, d)).astype('float32') + xb[:, 0] += np.arange(nb) / 1000. 
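+ # as in the faiss tutorial, the index-dependent offset adds structure along the first dimension so nearest-neighbor results are not purely random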
+ xq = np.random.random((nq, d)).astype('float32') + xq[:, 0] += np.arange(nq) / 1000. + + # -------------------------------- + # Step 2: Build `Index' object + # Note some of the indexes require a training phase to analyze the data distribution. + # -------------------------------- + index = faiss.IndexFlatL2(d) # build the index + print(index.is_trained) + index.add(xb) # add vectors to the index + print(index.ntotal) + + k = 4 # we want to see 4 nearest neighbors + D, I = index.search(xb[:5], k) # sanity check + print(I) + print(D) + st_time = time.time() + D, I = index.search(xq, k) # actual search + print("time elapsed {}".format(time.time() - st_time)) + print(I[:5]) # neighbors of the 5 first queries + print(I[-5:]) # neighbors of the 5 last queries + + +def simulate_mee_runtime(n_videos=1000000, d=256, n_query=100, max_neighbors=100, n_runs=5, n_warmup_runs=10): + """ Search over a database of shape [n_videos, d] with query of shape [n_query, d]. + For each query, return max_neighbors results. + """ + import faiss + torch.cuda.synchronize() + st_time = time.time() + fake_database = faiss.rand((n_videos, d)) + fake_query = faiss.rand((n_query, d)) + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + torch.cuda.synchronize() + st_time = time.time() + index = faiss.index_factory(d, "IVF4096,Flat", faiss.METRIC_L2) + index_ivf = faiss.extract_index_ivf(index) + clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) + index_ivf.clustering_index = clustering_index + torch.cuda.synchronize() + logger.info("Build/Move to GPU? index time {}".format(time.time() - st_time)) + + st_time = time.time() + torch.cuda.synchronize() + index_ivf.train(fake_database) + torch.cuda.synchronize() + logger.info("Train index time {}".format(time.time() - st_time)) + + times = [] + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() + D, I = index_ivf.search(fake_query, max_neighbors) + torch.cuda.synchronize() + times.append(time.time() - st_time) + avg_time = np.mean(times[n_warmup_runs:]) * 2 # video + sub + logger.info("Avg searching time ({} runs) {}".format(n_runs, avg_time)) + return avg_time + + +def simulate_cal_rerank_time(n_moments=200, avg_n_clips_per_moment=7, d=256, n_query=100, max_neighbors=100, + n_runs=5, n_warmup_runs=10): + st_time = time.time() + torch.cuda.synchronize() + fake_database = torch.randn((n_moments * avg_n_clips_per_moment, d), dtype=torch.float32).cuda() + fake_query = torch.randn((n_query, d), dtype=torch.float32).cuda() + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + times = [] + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() + fake_dist = torch.cdist(fake_query, fake_database, p=2) + fake_dist = fake_dist.view(n_query, n_moments, avg_n_clips_per_moment).mean(2) + fake_dist = torch.cdist(fake_query, fake_database, p=2) + fake_dist = fake_dist.view(n_query, n_moments, avg_n_clips_per_moment).mean(2) # video + sub + fake_dist = fake_dist + fake_dist + fake_top_indices, fake_top_dist = torch.topk(fake_dist, k=max_neighbors, dim=1, largest=False, sorted=True) + torch.cuda.synchronize() + times.append(time.time() - st_time) + avg_time = np.mean(times[n_warmup_runs:]) + logger.info("searching time {}".format(avg_time)) + return avg_time + + +def simulate_mcn_rerank_time(n_moments=200, d=256, n_query=100, max_neighbors=100, n_runs=5, n_warmup_runs=10): + 
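"""Like simulate_cal_rerank_time above, but each MCN moment is a single pooled vector, so no per-clip averaging is needed.""" +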
torch.cuda.synchronize() + st_time = time.time() + fake_database = torch.randn((n_moments, d), dtype=torch.float32).cuda() + fake_query = torch.randn((n_query, d), dtype=torch.float32).cuda() + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + times = [] + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() + fake_dist = torch.cdist(fake_query, fake_database, p=2).view(n_query, n_moments) + fake_dist = torch.cdist(fake_query, fake_database, p=2).view(n_query, n_moments) # video + sub + fake_dist = fake_dist + fake_dist + fake_top_indices, fake_top_dist = torch.topk(fake_dist, k=max_neighbors, dim=1, largest=False, sorted=True) + torch.cuda.synchronize() + times.append(time.time() - st_time) + avg_time = np.mean(times[n_warmup_runs:]) # + logger.info("searching time {}".format(avg_time)) + return avg_time + + +def simulate_xml_rerank_time(n_videos=100, avg_n_clips_per_video=20, d=256, n_query=100, max_neighbors=100, + n_runs=5, n_warmup_runs=10): + torch.cuda.synchronize() + st_time = time.time() + fake_database = torch.randn((d, n_videos*avg_n_clips_per_video), dtype=torch.float32).cuda() + fake_query = torch.randn((n_query, d), dtype=torch.float32).cuda() + conv = nn.Conv1d(in_channels=1, out_channels=2, kernel_size=5, stride=1, padding=2, bias=False).cuda() + torch.cuda.synchronize() + logger.info("Construct fake database + query time {}".format(time.time() - st_time)) + + times = dict( + conv=[], + prod=[], + topk=[], + triu=[] + ) + for _ in range(n_warmup_runs+n_runs): + torch.cuda.synchronize() + st_time = time.time() # [100, 256] [100, 20, 256] + fake_dist = torch.mm(fake_query, fake_database).view(n_query*n_videos, -1) + fake_dist = torch.mm(fake_query, fake_database).view(n_query * n_videos, -1) # video + sub + fake_dist = fake_dist + fake_dist + torch.cuda.synchronize() + times["prod"].append(time.time() - st_time) + torch.cuda.synchronize() + st_time = time.time() + fake_dist = conv(fake_dist.unsqueeze(1))[:, 0, :] + torch.cuda.synchronize() + times["conv"].append(time.time() - st_time) + torch.cuda.synchronize() + st_time = time.time() + fake_prob_prod = torch.triu(torch.einsum("ns,ne->nse", fake_dist, fake_dist)).view(n_query, -1) + torch.cuda.synchronize() + times["triu"].append(time.time() - st_time) + torch.cuda.synchronize() + st_time = time.time() + fake_top_indices, fake_top_dist = torch.topk(fake_prob_prod, k=max_neighbors, dim=1, largest=True, sorted=True) + torch.cuda.synchronize() + times["topk"].append(time.time() - st_time) + avg_time = {k: np.mean(times[k][n_warmup_runs:]) for k in times} + avg_time["all"] = np.sum(list(avg_time.values())) + logger.info("searching time {}".format(avg_time)) + return avg_time + + +def get_storage_size(hsz, n_videos, n_clips_per_video, n_moments, n_total_clips_in_moments, dtype_size=4): + """dtype_size: float32, 4B""" + GB = 1024**3 + # multiply by 2 for video+sub, xml has two level, so it has an additional 2 to multiply by. + storage = dict( + mee=n_videos * hsz * dtype_size * 2. / GB, + cal=n_total_clips_in_moments * hsz * dtype_size * 2. / GB, + mcn=n_moments * hsz * dtype_size * 2. / GB, + xml=n_videos * n_clips_per_video * hsz * dtype_size * 2. * 2. 
/ GB + ) + print("storage (GB) {}".format(storage)) + return storage + + +def main_run(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--mode", type=str, default="mee", help="which models to simulate") + parser.add_argument("--cache_dir", type=str, default="baselines/profiling/cache", help="save index/results path") + parser.add_argument("--n_runs", type=int, default=100, help="number of runs to calc average") + parser.add_argument("--n_warmup_runs", type=int, default=10, help="number of warmup runs, to init cuda, etc.") + args = parser.parse_args() + + """ + The numbers are get from the first author of + `Temporal Localization of Moments in Video Collections with Natural Language` + """ + k = 100 + n_query = 100 + n_videos = 1000000 + n_moments_per_video = 170 + hsz = 256 + n_clips_per_video = 20 + n_total_clips_in_moments = 1170946944 + n_moments = 170000000 + max_clips_per_proposal = 14 # assume padding to this number + avg_clips_per_proposal = 7 # 6.88 + + mode = args.mode + cfg_path = os.path.join(args.cache_dir, "{}_args.json".format(mode)) + + n_runs = args.n_runs + n_warmup_runs = args.n_warmup_runs + torch.set_grad_enabled(False) + if mode in ["mee", "mee_torch"]: + func_args = dict(n_videos=n_videos, d=hsz, n_query=n_query, max_neighbors=k, + n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_mee_runtime(**func_args) + elif mode == "xml_vr": + func_args = dict(n_videos=n_videos*n_clips_per_video, d=hsz, n_query=n_query, + max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_mee_runtime(**func_args) + elif mode == "cal": + # can only use n_query <= 4000, so use 4000. To get 20000, simply x5 the final time. + n_cal_rerank_videos = 100 + func_args = dict(n_moments=n_cal_rerank_videos*n_moments_per_video, + avg_n_clips_per_moment=avg_clips_per_proposal, + d=hsz, n_query=n_query, max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_cal_rerank_time(**func_args) + elif mode == "mcn": + n_cal_rerank_videos = 100 + func_args = dict(n_moments=n_cal_rerank_videos*n_moments_per_video, d=hsz, n_query=n_query, + max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_mcn_rerank_time(**func_args) + elif mode == "xml": + n_xml_videos = 100 + func_args = dict(n_videos=n_xml_videos, avg_n_clips_per_video=n_clips_per_video, + d=hsz, n_query=n_query, max_neighbors=k, n_runs=n_runs, n_warmup_runs=n_warmup_runs) + avg_time = simulate_xml_rerank_time(**func_args) + elif mode == "storage": + func_args = dict(hsz=hsz, n_videos=n_videos, n_clips_per_video=n_clips_per_video, + n_moments=n_moments, n_total_clips_in_moments=n_total_clips_in_moments, dtype_size=4) + storage = get_storage_size(**func_args) + else: + raise NotImplementedError + + if mode == "storage": + func_args["storage"] = storage + else: + func_args["n_runs"] = args.n_runs + func_args["avg_time"] = avg_time + func_args["mode"] = mode + print(func_args) + save_json(func_args, cfg_path, save_pretty=True) + + +if __name__ == '__main__': + main_run() + # compare_l2dist_inner_product_time() diff --git a/baselines/profiling/search_time_performance.sh b/baselines/profiling/search_time_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..4a33510bee6aa13f01d081123aec7892e50334a3 --- /dev/null +++ b/baselines/profiling/search_time_performance.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +mode=$1 +#dt=$(date '%Y_%m_%d_%H_%M_%S'); +#echo "$dt" +python 
baselines/profiling/search_time_performance.py \ +--mode ${mode} \ +--cache_dir baselines/profiling/cache + +#| tee baselines/profiling/cache/${mode}_${dt}.log \ No newline at end of file diff --git a/standalone_eval/__init__.py b/standalone_eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/standalone_eval/eval.py b/standalone_eval/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..164a18e0aa40e7973cade49f4f4dd24fcf497678 --- /dev/null +++ b/standalone_eval/eval.py @@ -0,0 +1,300 @@ +""" +Load prediction file and GT file to calculate TVR metrics: +- recall at top K (R@K), for a specified IoU, where K in [1, 5, 10, 100], IoU in [0.5, 0.7] +""" +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict, defaultdict + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def pad_sequences_1d_np(sequences, dtype=np.float32): + + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max(lengths)) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max(lengths)), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask + + +def compute_temporal_iou_batch(preds, gt): + """ compute intersection-over-union along temporal axis + This function is significantly faster than `compute_temporal_iou`, + the result should be the same. 
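+ It vectorizes the IoU computation of N predicted segments against a single ground-truth segment.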
+ Args: + preds: np.ndarray, (N, 2), [st (float), ed (float)] * N + gt: [st (float), ed (float)] + Returns: + iou: np.ndarray, (N, ) + + References: + for np.divide with zeros, see https://stackoverflow.com/a/37977222 + """ + intersection = np.maximum(0, np.minimum(preds[:, 1], gt[1]) - np.maximum(preds[:, 0], gt[0])) + union = np.maximum(preds[:, 1], gt[1]) - np.minimum(preds[:, 0], gt[0]) # the enclosing span (min start to max end), not the exact union + return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0) + + +def get_rounded_percentage(float_number, n_floats=2): + return round(float_number * 100, n_floats) + + +TASK_TYPES = OrderedDict([ + ("VCMR", "Video Corpus Moment Retrieval"), + ("SVMR", "Single Video Moment Retrieval"), + ("VR", "regular Video Retrieval") +]) + + +def eval_by_task_type(moment_predictions, video2idx, ground_truth, + iou_thds=(0.5, 0.7), recall_topks=(1, 5, 10, 100), + task_type="SVMR", max_pred_per_query=100, match_number=True, verbose=True, use_desc_type=True): + """ A predicted triplet is positive only if: + 1) its vid_name matches the GT vid_name + 2) IoU between its timestamp and GT timestamp is higher than the given threshold + + moment_predictions w.r.t. different task_type: + For each query, only the top max_pred_per_query [vid_name, st, ed] triplets are evaluated (the score entry is ignored). + VCMR: vid_name may repeat across predictions. + SVMR: vid_name is fixed to be the GT vid_name. + VR: vid_name does not repeat; st and ed are not used. + + Args: + video2idx: {vid_name (str): index (int), ...} + moment_predictions: list(dict), each dict is { + "desc": str, + "query_id": int, + "predictions": [vid_name_idx (int), st (float), ed (float), score (float)] * n_pred, + sorted predictions, n_pred could be different across dicts. For each prediction, + only the first 3 elements [vid_name_idx (int), st (float), ed (float)] are used, + any following elements are ignored; the score is kept only for the record. + } + ground_truth: list(dict), each dict is { + "desc": str, + "query_id": int, + "type": str, one of [v, t, vt] + "vid_name": str + "ts": [st (float), ed (float)], or a list of such pairs (len >= 4, e.g., for DiDeMo). + ... + } + iou_thds: temporal IoU thresholds + recall_topks: recall at different top k + task_type: str, one of ["VCMR", "SVMR", "VR"], see TASK_TYPES for definitions. + max_pred_per_query: int, only the top max_pred_per_query predictions for each query are used. + match_number: bool, must be set to True for real evaluation; False is only used for debugging.
+ verbose: + use_desc_type: only TVR has desc type + Returns: + + """ + assert task_type in TASK_TYPES, "task_type must be one of {}".format(list(TASK_TYPES.keys())) + if verbose: + print("Running evaluation with task_type {}, n results {}; n gt {}" + .format(task_type, len(moment_predictions), len(ground_truth))) + + predictions_by_query_id = {e["query_id"]: e for e in moment_predictions} + gt_by_query_id = {e["query_id"]: e for e in ground_truth} + desc_type2idx = {"v": 0, "t": 1, "vt": 2} + desc_types = [] # n_desc + + if match_number: + assert set(gt_by_query_id.keys()) == set(predictions_by_query_id.keys()), \ + "query_ids in predictions and ground_truth must match" + # assert len(set([len(e["predictions"]) for e in predictions_by_query_id.values()])) == 1, \ + # "all queries must have the same number of predictions" + + pred_info_matrix_collection = [] + for k, gt_item in tqdm(gt_by_query_id.items(), desc="Loop over moments", leave=False): + if not match_number and k not in predictions_by_query_id: + continue + pred_info_matrix = np.array( + [e[:3] for e in predictions_by_query_id[k]["predictions"]][:max_pred_per_query], + dtype=np.float32) # (n_pred, 3) + if use_desc_type: + desc_types.append(desc_type2idx[gt_item["type"]]) + vid_name_matched_pred = pred_info_matrix[:, 0] == video2idx[gt_item["vid_name"]] # bool, (n_pred, ) + pred_info_matrix = np.concatenate([pred_info_matrix, vid_name_matched_pred[:, None]], axis=1) # (n_pred, 4) + + # add 1 + len(iou_thds) columns, iou_scores, iou_corrects for each iou_thd. + iou_thd_corrects_columns = [] + if len(gt_item["ts"]) >= 4: # didemo, fro all 3 splits, at least 4 ts for each, < 0.5% has more than 4. + least_n_overlap = 2 # True if overlapped with at least least_n_overlap GT ts. + iou_corrects_dict = defaultdict(list) + for single_gt_ts in gt_item["ts"]: + single_gt_ts = np.array(single_gt_ts, dtype=np.float32) # (2, ) + # iou scores of the predictions that have wrong vid_name are set to 0. + iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred + for iou_thd in iou_thds: + iou_corrects_dict[iou_thd].append(iou_scores >= iou_thd) + for iou_thd in iou_thds: + iou_corrects = sum(iou_corrects_dict[iou_thd]) >= least_n_overlap # bool, (n_pred, ) + iou_thd_corrects_columns.append(iou_corrects[:, None]) + + else: # should be 2, len([st, ed]) == 2 + single_gt_ts = np.array(gt_item["ts"], dtype=np.float32) # (2, ) + # iou scores of the predictions that have wrong vid_name are set to 0. 
+ iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred + + for iou_thd in iou_thds: + iou_corrects = iou_scores >= iou_thd # bool, (n_pred, ) + iou_thd_corrects_columns.append(iou_corrects[:, None]) + + pred_info_matrix = np.concatenate([pred_info_matrix, ] + iou_thd_corrects_columns, axis=1) # (n_pred, 6) + pred_info_matrix_collection.append(pred_info_matrix) + + # column header [vid_name_idx (int), st (float), ed (float), is_vid_name_match (bool), + # iou_scores>=iou_thd0 (bool), iou_scores>=iou_thd1 (bool)] + pred_info_matrix_collection = pad_sequences_1d_np(pred_info_matrix_collection)[0] # (n_desc, n_pred, 6) + if use_desc_type: + desc_types = np.array(desc_types) # (n_desc) + + # results wrapper + metrics = OrderedDict() + metrics_by_type = OrderedDict() + + iou_c_offset = 4 # iou_corrects column index starts here + if task_type == "VCMR": + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = \ + get_rounded_percentage(np.mean(np.sum(iou_corrects[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(iou_corrects[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type + ) + elif task_type == "SVMR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + n_desc = len(vid_name_matched) + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = get_rounded_percentage(np.mean( + [np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 for idx in range(n_desc)] + )) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum([np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 and type_corrects[idx] + for idx in range(n_desc)]) + / n_desc_in_type) + + elif task_type == "VR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + for k in recall_topks: + metrics["r{}".format(k)] = \ + get_rounded_percentage(np.mean(np.sum(vid_name_matched[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for k in recall_topks: + metrics_by_type["{}-r{}".format(desc_type, k)] = 
get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(vid_name_matched[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type) + else: + raise ValueError("task_type wrong.") + if use_desc_type: + metrics_by_type["desc_type_ratio"] = "v {} t {} vt {}"\ + .format(*[get_rounded_percentage(1.0 * np.sum(desc_types == desc_type2idx[k]) / len(desc_types)) + for k in ["v", "t", "vt"]]) + return metrics, metrics_by_type + + +def eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True, match_number=True, use_desc_type=True): + video2idx = submission["video2idx"] + submitted_task_types = [k for k in TASK_TYPES if k in submission] + if verbose: + print("Evaluating for task {}".format(submitted_task_types)) + eval_metrics = OrderedDict() + metrics_raw_dict = {} + for task_type in submitted_task_types: + metrics, metrics_by_type = eval_by_task_type( + submission[task_type], video2idx, ground_truth, + iou_thds=iou_thds, recall_topks=(1, 5, 10, 100), + task_type=task_type, max_pred_per_query=100, + match_number=match_number, verbose=verbose, use_desc_type=use_desc_type) + metrics_raw_dict[task_type] = metrics + metrics_raw_dict[task_type+"_by_type"] = metrics_by_type + + for task_type in submitted_task_types: + eval_metrics[task_type] = metrics_raw_dict[task_type] + if use_desc_type: + for task_type in submitted_task_types: + eval_metrics[task_type+"_by_type"] = metrics_raw_dict[task_type+"_by_type"] + return eval_metrics + + +def eval_main(): + import argparse + parser = argparse.ArgumentParser(description="TVR Evaluation Script") + parser.add_argument("--submission_path", type=str, help="path to generated prediction file") + parser.add_argument("--gt_path", type=str, help="path to GT file") + parser.add_argument("--save_path", type=str, help="path to save the results") + parser.add_argument("--not_verbose", action="store_true") + args = parser.parse_args() + + verbose = not args.not_verbose + submission = load_json(args.submission_path) + gt = load_jsonl(args.gt_path) + results = eval_retrieval(submission, gt, iou_thds=(0.5, 0.7), verbose=verbose) + if verbose: + print(json.dumps(results, indent=4)) + + with open(args.save_path, "w") as f: + f.write(json.dumps(results, indent=4)) + + +if __name__ == '__main__': + eval_main() diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/basic_utils.py b/utils/basic_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1b319182fb7136058290bc3a9e2309fbe01c0db6 --- /dev/null +++ b/utils/basic_utils.py @@ -0,0 +1,206 @@ +import os +import json +import zipfile +import numpy as np +import pickle + + +def load_pickle(filename): + with open(filename, "rb") as f: + return pickle.load(f) + + +def save_pickle(data, filename): + with open(filename, "wb") as f: + pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def save_json(data, filename, save_pretty=False, sort_keys=False): + with open(filename, "w") as f: + if save_pretty: + f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) + else: + json.dump(data, f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def save_jsonl(data, filename): + """data is a list""" + with open(filename, "w") as f: + f.write("\n".join([json.dumps(e) for e in data])) + + +def 
save_lines(list_of_str, filepath): + with open(filepath, "w") as f: + f.write("\n".join(list_of_str)) + + +def read_lines(filepath): + with open(filepath, "r") as f: + return [e.strip("\n") for e in f.readlines()] + + +def mkdirp(p): + if not os.path.exists(p): + os.makedirs(p) + + +def flat_list_of_lists(l): + """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]""" + return [item for sublist in l for item in sublist] + + +def convert_to_seconds(hms_time): + """ convert '00:01:12' to 72 seconds. + :hms_time (str): time as a colon-separated string, e.g. '00:01:12' + :return (float): time in seconds, e.g. 72 + """ + times = [float(t) for t in hms_time.split(":")] + return times[0] * 3600 + times[1] * 60 + times[2] + + +def get_video_name_from_url(url): + return url.split("/")[-1][:-4] + + +def merge_dicts(list_dicts): + merged_dict = list_dicts[0].copy() + for i in range(1, len(list_dicts)): + merged_dict.update(list_dicts[i]) + return merged_dict + + +def l2_normalize_np_array(np_array, eps=1e-5): + """np_array: np.ndarray, (*, D), where the last dim will be normalized""" + return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps) + + +def make_zipfile(src_dir, save_path, enclosing_dir="", exclude_dirs=None, exclude_extensions=None, + exclude_dirs_substring=None): + """Make a zip file of src_dir and save it to save_path. + Directories in exclude_dirs are skipped if they are subdirectories of src_dir. + An enclosing_dir is added if specified. + """ + abs_src = os.path.abspath(src_dir) + with zipfile.ZipFile(save_path, "w") as zf: + for dirname, subdirs, files in os.walk(src_dir): + if exclude_dirs is not None: + for e_p in exclude_dirs: + if e_p in subdirs: + subdirs.remove(e_p) + if exclude_dirs_substring is not None: + to_rm = [] + for d in subdirs: + if exclude_dirs_substring in d: + to_rm.append(d) + for e in to_rm: + subdirs.remove(e) + arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:]) + zf.write(dirname, arcname) + for filename in files: + if exclude_extensions is not None: + if os.path.splitext(filename)[1] in exclude_extensions: + continue # do not zip it + absname = os.path.join(dirname, filename) + arcname = os.path.join(enclosing_dir, absname[len(abs_src) + 1:]) + zf.write(absname, arcname) + + +class AverageMeter(object): + """Computes and stores the average and current/max/min value""" + def __init__(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + + def update(self, val, n=1): + self.max = max(val, self.max) + self.min = min(val, self.min) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True): + """Dissect an array (N, D) into a list of sub-arrays; np_array.shape[0] == sum(lengths). + The output is a list of nd arrays with the singleton dimension kept.""" + if assert_equal: + assert len(np_array) == sum(lengths) + length_indices = [0, ] + for i in range(len(lengths)): + length_indices.append(length_indices[i] + lengths[i]) + if dim == 0: + array_list = [np_array[length_indices[i]:length_indices[i+1]] for i in range(len(lengths))] + elif dim == 1: + array_list = [np_array[:, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + elif dim == 2: + array_list = [np_array[:, :, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + else: + raise
NotImplementedError + return array_list + + +import time +import logging +import os + +def get_logger(dir, tile): + os.makedirs(dir, exist_ok=True) + log_file = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + log_file = os.path.join(dir, "{}_{}.log".format(log_file, tile)) + + logger = logging.getLogger() + logger.setLevel('DEBUG') + BASIC_FORMAT = "%(levelname)s:%(message)s" + # DATE_FORMAT = '%Y-%m-%d %H:%M:%S' + formatter = logging.Formatter(BASIC_FORMAT) + chlr = logging.StreamHandler() + chlr.setFormatter(formatter) + + fhlr = logging.FileHandler(log_file) + fhlr.setFormatter(formatter) + fhlr.setLevel('INFO') + + logger.addHandler(chlr) + logger.addHandler(fhlr) + return logger + +def get_ratio_from_counter(counter_obj, threshold=200): + keys = counter_obj.keys() + values = counter_obj.values() + filtered_values = [counter_obj[k] for k in keys if k > threshold] + return float(sum(filtered_values)) / sum(values) + + +def get_show_name(vid_name): + """ + get tvshow name from vid_name + :param vid_name: video clip name + :return: tvshow name + """ + show_list = ["friends", "met", "castle", "house", "grey"] + vid_name_prefix = vid_name.split("_")[0] + show_name = vid_name_prefix if vid_name_prefix in show_list else "bbt" + return show_name diff --git a/utils/find_best_epoch.py b/utils/find_best_epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..7d13277255212e453cd1a63f37b6df0d4ce108ba --- /dev/null +++ b/utils/find_best_epoch.py @@ -0,0 +1,21 @@ +def rewrite_epoch(filename, new_file_name): + max_value = float(-100) + new_file = [] + + with open(filename, 'r') as file: + for line in file: + new_file.append(line) + if line.startswith("INFO:VAL"): + anchor = float(line.split()[5]) # Assuming the value is at the 5th index + if anchor > max_value: + max_value = anchor + print(max_value) + new_file.append("BEST: " + line) + + with open(new_file_name, 'w') as file: + file.writelines(new_file) + +# Example usage +filename = "results/XML_top40_20240704_170747/20240704_170747_XML_top40.log" +new_file_name = "results/XML_top40_20240704_170747/new.log" +best_epoch = rewrite_epoch(filename, new_file_name) diff --git a/utils/mk_video_split_with_duration.py b/utils/mk_video_split_with_duration.py new file mode 100644 index 0000000000000000000000000000000000000000..ab5a524174febeb4515e511dc33c10a74c212d84 --- /dev/null +++ b/utils/mk_video_split_with_duration.py @@ -0,0 +1,18 @@ +from utils.basic_utils import load_json, save_json + + +def combine(video_name_split_path, video_duration_path, save_path): + video_name_split = load_json(video_name_split_path) + video_duration_dict = load_json(video_duration_path) + + combined_dict = {} + for split_name, split_video_names in video_name_split.items(): + combined_dict[split_name] = {vid_name: video_duration_dict[vid_name] + for vid_name in split_video_names} + save_json(combined_dict, save_path) + + +if __name__ == '__main__': + import sys + combine(*sys.argv[1:]) + diff --git a/utils/model_utils.py b/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c9ef498bc7005bfa162047ba3a2c49196a95017f --- /dev/null +++ b/utils/model_utils.py @@ -0,0 +1,105 @@ +__author__ = "Jie Lei" + +# ref: https://github.com/lichengunc/MAttNet/blob/master/lib/layers/lang_encoder.py#L11 +# ref: https://github.com/easonnie/flint/blob/master/torch_util.py#L272 +import torch +import torch.nn as nn +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + + +class RNNEncoder(nn.Module): + """A RNN 
wrapper handles variable length inputs, always set batch_first=True. + Supports LSTM, GRU and RNN. Tested with PyTorch 0.3 and 0.4 + """ + def __init__(self, word_embedding_size, hidden_size, bidirectional=True, + dropout_p=0, n_layers=1, rnn_type="lstm", + return_hidden=True, return_outputs=True, + allow_zero=False): + super(RNNEncoder, self).__init__() + """ + :param word_embedding_size: rnn input size + :param hidden_size: rnn output size + :param dropout_p: between rnn layers, only useful when n_layer >= 2 + """ + self.allow_zero = allow_zero + self.rnn_type = rnn_type + self.n_dirs = 2 if bidirectional else 1 + # - add return_hidden keyword arg to reduce computation if hidden is not needed. + self.return_hidden = return_hidden + self.return_outputs = return_outputs + self.rnn = getattr(nn, rnn_type.upper())(word_embedding_size, hidden_size, n_layers, + batch_first=True, + bidirectional=bidirectional, + dropout=dropout_p) + + def sort_batch(self, seq, lengths): + sorted_lengths, perm_idx = lengths.sort(0, descending=True) + if self.allow_zero: # deal with zero by change it to one. + sorted_lengths[sorted_lengths == 0] = 1 + reverse_indices = [0] * len(perm_idx) + for i in range(len(perm_idx)): + reverse_indices[perm_idx[i]] = i + sorted_seq = seq[perm_idx] + return sorted_seq, list(sorted_lengths), reverse_indices + + def forward(self, inputs, lengths): + """ + inputs, sorted_inputs -> (B, T, D) + lengths -> (B, ) + outputs -> (B, T, n_dirs * D) + hidden -> (n_layers * n_dirs, B, D) -> (B, n_dirs * D) keep the last layer + - add total_length in pad_packed_sequence for compatiblity with nn.DataParallel, --remove it + """ + assert len(inputs) == len(lengths) + sorted_inputs, sorted_lengths, reverse_indices = self.sort_batch(inputs, lengths) + packed_inputs = pack_padded_sequence(sorted_inputs, sorted_lengths, batch_first=True) + outputs, hidden = self.rnn(packed_inputs) + if self.return_outputs: + # outputs, lengths = pad_packed_sequence(outputs, batch_first=True, total_length=int(max(lengths))) + outputs, lengths = pad_packed_sequence(outputs, batch_first=True) + outputs = outputs[reverse_indices] + else: + outputs = None + if self.return_hidden: # + if self.rnn_type.lower() == "lstm": + hidden = hidden[0] + hidden = hidden[-self.n_dirs:, :, :] + hidden = hidden.transpose(0, 1).contiguous() + hidden = hidden.view(hidden.size(0), -1) + hidden = hidden[reverse_indices] + else: + hidden = None + return outputs, hidden + + +def pool_across_time(outputs, lengths, pool_type="max"): + """ Get maximum responses from RNN outputs along time axis + :param outputs: (B, T, D) + :param lengths: (B, ) + :param pool_type: str, 'max' or 'mean' + :return: (B, D) + """ + if pool_type == "max": + outputs = [outputs[i, :int(lengths[i]), :].max(dim=0)[0] for i in range(len(lengths))] + elif pool_type == "mean": + outputs = [outputs[i, :int(lengths[i]), :].mean(dim=0) for i in range(len(lengths))] + else: + raise NotImplementedError("Only support mean and max pooling") + return torch.stack(outputs, dim=0) + + +def count_parameters(model, verbose=True): + """Count number of parameters in PyTorch model, + References: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7. 
+ + from utils.utils import count_parameters + count_parameters(model) + import sys + sys.exit(1) + """ + n_all = sum(p.numel() for p in model.parameters()) + n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + if verbose: + print("Parameter Count: all {:,d}; trainable {:,d}".format(n_all, n_trainable)) + return n_all, n_trainable + diff --git a/utils/temporal_nms.py b/utils/temporal_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..545ed8045d7da4a6a831395029e39c0f803025d5 --- /dev/null +++ b/utils/temporal_nms.py @@ -0,0 +1,74 @@ +""" +Non-Maximum Suppression for video proposals. +""" + + +def compute_temporal_iou(pred, gt): + """ deprecated due to performance concerns + compute intersection-over-union along temporal axis + Args: + pred: [st (float), ed (float)] + gt: [st (float), ed (float)] + Returns: + iou (float): + + Ref: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + """ + intersection = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0])) + union = max(pred[1], gt[1]) - min(pred[0], gt[0]) # not the correct union though + if union == 0: + return 0 + else: + return 1.0 * intersection / union + + +def temporal_non_maximum_suppression(predictions, nms_threshold, max_after_nms=100): + """ + Args: + predictions: list(sublist), each sublist is [st (float), ed(float), score (float)], + note larger scores are better and are preserved. For metrics that are better when smaller, + please convert to its negative, e.g., convert distance to negative distance. + nms_threshold: float in [0, 1] + max_after_nms: + Returns: + predictions_after_nms: list(sublist), each sublist is [st (float), ed(float), score (float)] + References: + https://github.com/wzmsltw/BSN-boundary-sensitive-network/blob/7b101fc5978802aa3c95ba5779eb54151c6173c6/Post_processing.py#L42 + """ + if len(predictions) == 1: # only has one prediction, no need for nms + return predictions + + predictions = sorted(predictions, key=lambda x: x[2], reverse=True) # descending order + + tstart = [e[0] for e in predictions] + tend = [e[1] for e in predictions] + tscore = [e[2] for e in predictions] + rstart = [] + rend = [] + rscore = [] + while len(tstart) > 1 and len(rscore) < max_after_nms: # max 100 after nms + idx = 1 + while idx < len(tstart): # compare with every prediction in the list. + if compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]) > nms_threshold: + # rm highly overlapped lower score entries. + tstart.pop(idx) + tend.pop(idx) + tscore.pop(idx) + # print("--------------------------------") + # print(compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]])) + # print([tstart[0], tend[0]], [tstart[idx], tend[idx]]) + # print(tstart.pop(idx), tend.pop(idx), tscore.pop(idx)) + else: + # move to next + idx += 1 + rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + if len(rscore) < max_after_nms and len(tstart) >= 1: # add the last, possibly empty. 
+ rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + predictions_after_nms = [[st, ed, s] for s, st, ed in zip(rscore, rstart, rend)] + return predictions_after_nms diff --git a/utils/tensor_utils.py b/utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..72497127fdbbd935bfc8c42b5fae723db04d73f8 --- /dev/null +++ b/utils/tensor_utils.py @@ -0,0 +1,141 @@ +import numpy as np +import torch + + +def pad_sequences_1d(sequences, dtype=torch.long, device=torch.device("cpu"), fixed_length=None): + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + device: + fixed_length: pad all seq in sequences to fixed length. All seq should have a length <= fixed_length. + return will be of shape [len(sequences), fixed_length, ...] + Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=torch.long) + >>> test_data_3d = [torch.randn(2,3,4), torch.randn(4,3,4), torch.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=torch.float) + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + if "torch" in str(dtype): + sequences = [torch.tensor(s, dtype=dtype, device=device) for s in sequences] + else: + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + if fixed_length is not None: + max_length = fixed_length + else: + max_length = max(lengths) + if isinstance(sequences[0], torch.Tensor): + assert "torch" in str(dtype), "dtype and input type does not match" + padded_seqs = torch.zeros((len(sequences), max_length) + extra_dims, dtype=dtype, device=device) + mask = torch.zeros((len(sequences), max_length), dtype=torch.float32, device=device) + else: # np + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max_length) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max_length), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask # , lengths + + +def pad_sequences_2d(sequences, dtype=torch.long): + """ Pad a double-nested list or a sequence of n-d torch tensor into a (n+1)-d tensor, + only allow the first two dims has variable lengths + Args: + sequences: list(n-d tensor or list) + dtype: torch.long for word indices / torch.float (float32) for other cases + Returns: + Examples: + >>> test_data_list = [[[1, 3, 5], [3, 7, 4, 1]], [[98, 34, 11, 89, 90], [22], [34, 56]],] + >>> pad_sequences_2d(test_data_list, dtype=torch.long) # torch.Size([2, 3, 5]) + >>> test_data_3d = [torch.randn(2,2,4), torch.randn(4,3,4), torch.randn(1,5,4)] + >>> pad_sequences_2d(test_data_3d, dtype=torch.float) # torch.Size([2, 3, 5]) + >>> test_data_3d2 = [[torch.randn(2,4), ], [torch.randn(3,4), 
torch.randn(5,4)]] + >>> pad_sequences_2d(test_data_3d2, dtype=torch.float) # torch.Size([2, 3, 5]) + # TODO add support for numpy array + """ + bsz = len(sequences) + para_lengths = [len(seq) for seq in sequences] + max_para_len = max(para_lengths) + sen_lengths = [[len(word_seq) for word_seq in seq] for seq in sequences] + max_sen_len = max([max(e) for e in sen_lengths]) + + if isinstance(sequences[0], torch.Tensor): + extra_dims = sequences[0].shape[2:] + elif isinstance(sequences[0][0], torch.Tensor): + extra_dims = sequences[0][0].shape[1:] + else: + sequences = [[torch.Tensor(word_seq, dtype=dtype) for word_seq in seq] for seq in sequences] + extra_dims = () + + padded_seqs = torch.zeros((bsz, max_para_len, max_sen_len) + extra_dims, dtype=dtype) + mask = torch.zeros(bsz, max_para_len, max_sen_len).float() + + for b_i in range(bsz): + for sen_i, sen_l in enumerate(sen_lengths[b_i]): + padded_seqs[b_i, sen_i, :sen_l] = sequences[b_i][sen_i] + mask[b_i, sen_i, :sen_l] = 1 + return padded_seqs, mask # , sen_lengths + + +def find_max_triples(st_prob, ed_prob, top_n=5, prob_thd=None, tensor_type="torch"): + """ Find a list of (k1, k2) where k1 < k2 with the maximum values of st_prob[k1] * ed_prob[k2] + Args: + st_prob (torch.Tensor or np.ndarray): (N, L) batched start_idx probabilities + ed_prob (torch.Tensor or np.ndarray): (N, L) batched end_idx probabilities + top_n (int): return topN pairs with highest values + prob_thd (float): + tensor_type: str, np or torch + Returns: + batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] + """ + if tensor_type == "torch": + st_prob, ed_prob = st_prob.data.numpy(), ed_prob.data.numpy() + product = np.einsum("bm,bn->bmn", st_prob, ed_prob) + # (N, L, L) the lower part becomes zeros, start_idx < ed_idx + upper_product = np.triu(product, k=1) + return find_max_triples_from_upper_triangle_product(upper_product, top_n=top_n, prob_thd=prob_thd) + + +def find_max_triples_from_upper_triangle_product(upper_product, top_n=5, prob_thd=None): + """ Find a list of (k1, k2) where k1 < k2 with the maximum values of p1[k1] * p2[k2] + Args: + upper_product (torch.Tensor or np.ndarray): (N, L, L), the lower part becomes zeros, end_idx > start_idx + top_n (int): return topN pairs with highest values + prob_thd (float or None): + Returns: + batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] + """ + batched_sorted_triple = [] + for idx, e in enumerate(upper_product): + sorted_triple = top_n_array_2d(e, top_n=top_n) + if prob_thd is not None: + sorted_triple = sorted_triple[sorted_triple[2] >= prob_thd] + batched_sorted_triple.append(sorted_triple) + return batched_sorted_triple + + +def top_n_array_2d(array_2d, top_n): + """ Get topN indices and values of a 2d array, return a tuple of indices and their values, + ranked by the value + """ + row_indices, column_indices = np.unravel_index(np.argsort(array_2d, axis=None), array_2d.shape) + row_indices = row_indices[::-1][:top_n] + column_indices = column_indices[::-1][:top_n] + sorted_values = array_2d[row_indices, column_indices] + return np.stack([row_indices, column_indices, sorted_values], axis=1) # (N, 3)
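As a quick sanity check for the helpers added in `utils/tensor_utils.py` above, here is a minimal usage sketch of `pad_sequences_1d` and `find_max_triples`; it is not part of the diff, and the toy inputs are made up purely for illustration.

```python
import torch
from utils.tensor_utils import pad_sequences_1d, find_max_triples

# Pad three variable-length index sequences into a (3, 4) tensor plus a validity mask.
queries = [[1, 2, 3], [1, 2], [3, 4, 7, 9]]
padded, mask = pad_sequences_1d(queries, dtype=torch.long)
print(padded.shape, mask.shape)  # torch.Size([3, 4]) torch.Size([3, 4])

# Top (start, end) index pairs with start < end, scored by st_prob[st] * ed_prob[ed].
st_prob = torch.softmax(torch.randn(2, 6), dim=1)  # (N, L) start probabilities
ed_prob = torch.softmax(torch.randn(2, 6), dim=1)  # (N, L) end probabilities
triples = find_max_triples(st_prob, ed_prob, top_n=3)  # N arrays of shape (3, 3): [st_idx, ed_idx, score]
print(triples[0])
```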