diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..c3875fbc435c83968ce93faeba9a70338d202e3f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.log filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2812bcc7000ec0198a56d5c057701c81720e5d06
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*__pycache__
diff --git a/README.md b/README.md
index 7be5fc7f47d5db027d120b8024982df93db95b74..96a3416a020a0c2cbfee440d7066b2bcb4bafe94 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,47 @@
----
-license: mit
----
+---
+license: mit
+datasets:
+- axgroup/Ranking_TVR
+language:
+- en
+---
+# CONQUER_RVMR
+
+This repository contains the CONQUER baseline model for the Ranked Video Moment Retrieval (RVMR) task. The associated paper is titled "Video Moment Retrieval in Practical Setting: A Dataset of Ranked Moments for Imprecise Queries."
+
+The main repository of the paper is [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking), and this model is adapted from [CONQUER](https://github.com/houzhijian/CONQUER.git). The environment setup is the same as for RelocNet_RVMR, as detailed in the [TVR-Ranking](https://huggingface.co/axgroup/TVR-Ranking) repository.
+
+
+CONQUER leverages video retrieval results from [HERO](https://github.com/linjieli222/HERO.git). We continue to use these
+results when training on our TVR-Ranking dataset. Note that because the HERO results are obtained from the TVR dataset, there could be a data leakage issue in our task setting. However, this issue is negligible for two reasons: (i) the queries used in our setting are imprecise, rewritten queries, and (ii) each query has multiple ground-truth moments in our task setting, which were not annotated in the original TVR dataset.
+
+
+## Performance
+
+
+| **Model** | **Train Set Top N** | **IoU=0.3** | | **IoU=0.5** | | **IoU=0.7** | |
+|------------|---------------------|-------------|----------|-------------|----------|-------------|----------|
+| | | **Val** | **Test** | **Val** | **Test** | **Val** | **Test** |
+| **NDCG@10**| | | | | | | |
+| CONQUER | 1 | 0.0999 | 0.0859 | 0.0844 | 0.0709 | 0.0530 | 0.0512 |
+| CONQUER | 20 | 0.2406 | 0.2249 | 0.2222 | 0.2104 | 0.1672 | 0.1517 |
+| CONQUER | 40 | 0.2450 | 0.2219 | 0.2262 | 0.2085 | 0.1670 | 0.1515 |
+| **NDCG@20**| | | | | | | |
+| CONQUER | 1 | 0.0952 | 0.0835 | 0.0808 | 0.0687 | 0.0526 | 0.0484 |
+| CONQUER | 20 | 0.2130 | 0.1995 | 0.1976 | 0.1867 | 0.1527 | 0.1368 |
+| CONQUER | 40 | 0.2183 | 0.1968 | 0.2022 | 0.1851 | 0.1524 | 0.1365 |
+| **NDCG@40**| | | | | | | |
+| CONQUER | 1 | 0.0974 | 0.0866 | 0.0832 | 0.0718 | 0.0557 | 0.0510 |
+| CONQUER | 20 | 0.2029 | 0.1906 | 0.1891 | 0.1788 | 0.1476 | 0.1326 |
+| CONQUER | 40 | 0.2080 | 0.1885 | 0.1934 | 0.1775 | 0.1473 | 0.1323 |
+
+
+## Quick Start
+
+Modify the path in `run_disjoint_top20.sh` and then execute the script:
+
+```sh
+sh run_disjoint_top20.sh
+```
+
+Feel free to contribute or raise issues for any problems encountered.
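The IoU thresholds in the performance table above are temporal intersection-over-union values between a predicted moment and an annotated moment. As a minimal illustrative sketch (not part of this repository; the video name and timestamps below are made-up placeholders), this is how one ranked prediction in the format emitted by `inference.py` (keys `video_name`, `timestamp`, `model_scores`) would be matched against a ground-truth moment at IoU >= 0.5:

```python
def temporal_iou(pred_span, gt_span):
    """Intersection-over-union of two [start, end] spans given in seconds."""
    inter = max(0.0, min(pred_span[1], gt_span[1]) - max(pred_span[0], gt_span[0]))
    union = max(pred_span[1], gt_span[1]) - min(pred_span[0], gt_span[0])
    return inter / union if union > 0 else 0.0

# One ranked prediction (same keys as the inference.py output) and one
# hypothetical annotated moment; names and times are placeholders.
pred = {"video_name": "show_clip_0001", "timestamp": [3.0, 9.0], "model_scores": 0.81}
gt = {"video_name": "show_clip_0001", "timestamp": [4.5, 9.0]}

hit_at_iou_05 = (pred["video_name"] == gt["video_name"]
                 and temporal_iou(pred["timestamp"], gt["timestamp"]) >= 0.5)
print(hit_at_iou_05)  # True: IoU = 4.5 / 6.0 = 0.75
```

The `calculate_ndcg_iou` helper imported in `inference.py` then aggregates such IoU-thresholded matches over the top-K ranked moments into the NDCG@K numbers reported above.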
\ No newline at end of file diff --git a/config/config.py b/config/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a407a816d85d1fb34422dd61a3e8d548a9541ac9 --- /dev/null +++ b/config/config.py @@ -0,0 +1,227 @@ +import os +import time +import torch +import argparse +import sys +import pprint + +import json +from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile + + +def parse_with_config(parser): + args = parser.parse_args() + if args.config is not None: + config_args = json.load(open(args.config)) + override_keys = {arg[2:].split('=')[0] for arg in sys.argv[1:] + if arg.startswith('--')} + for k, v in config_args.items(): + if k not in override_keys: + setattr(args, k, v) + del args.config + return args + + +class BaseOptions(object): + saved_option_filename = "opt.json" + ckpt_filename = "model.ckpt" + tensorboard_log_dir = "tensorboard_log" + train_log_filename = "train.log.txt" + eval_log_filename = "eval.log.txt" + + def __init__(self): + self.parser = argparse.ArgumentParser() + self.initialized = False + self.opt = None + + def initialize(self): + self.initialized = True + self.parser.add_argument("--dset_name", type=str, default="tvr", choices=["tvr", "didemo"]) + self.parser.add_argument("--eval_split_name", type=str, default="val", + help="should match keys in video_duration_idx_path, must set for VCMR") + self.parser.add_argument("--data_ratio", type=float, default=1.0, + help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." + "Use small portion for debug purposes. Note this is different from --debug, " + "which works by breaking the loops, typically they are not used together.") + self.parser.add_argument("--debug", action="store_true", + help="debug (fast) mode, break all loops, do not load all data into memory.") + self.parser.add_argument("--disable_eval", action="store_true", + help="disable eval") + self.parser.add_argument("--results_root", type=str, default="results") + self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") + self.parser.add_argument("--seed", type=int, default=2018, help="random seed") + self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") + self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") + self.parser.add_argument("--num_workers", type=int, default=8, + help="num subprocesses used to load the data, 0: use main process") + + # training config + self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") + self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") + self.parser.add_argument("--n_epoch", type=int, default=50, help="number of epochs to run") + self.parser.add_argument("--max_es_cnt", type=int, default=3, + help="number of epochs to early stop, use -1 to disable early stop") + self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", + default=["VCMR", "SVMR", "VR"], choices=["VCMR", "SVMR", "VR"], + help="evaluate and report numbers for tasks specified here.") + self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") + self.parser.add_argument("--eval_query_bsz", type=int, default=8, + help="mini-batch size at inference, for query") + self.parser.add_argument("--no_eval_untrained", action="store_true", help="Evaluate on un-trained model") + self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") + self.parser.add_argument("--eval_epoch_num", type=int, default=1, help="eval_epoch_num") + + # Data config + self.parser.add_argument("--max_ctx_len", type=int, default=100, + help="max number of snippets, 100 for tvr clip_length=1.5, only 109/21825 > 100") + self.parser.add_argument("--max_desc_len", type=int, default=30, help="max number of query token") + self.parser.add_argument("--clip_length", type=float, default=1.5, + help="each video will be uniformly segmented into small clips") + self.parser.add_argument("--ctx_mode", type=str, default="visual_sub", + help="adopted modality list for each clip") + self.parser.add_argument("--dataset_config", type=str,help="data config") + + + # Model config + + self.parser.add_argument("--visual_dim", type=int,default=4352,help="visual modality feature dimension") + self.parser.add_argument("--text_dim", type=int, default=768, help="textual modality feature dimension") + self.parser.add_argument("--query_dim", type=int, default=768, help="query feature dimension") + self.parser.add_argument("--hidden_dim", type=int, default=768, help="joint dimension") + self.parser.add_argument("--no_output_moe_weight",action="store_true", + help="whether NOT to use query dependent fusion") + self.parser.add_argument("--model_config", type=str, help="model config") + + + ## Train config + self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for moment cross-entropy loss") + self.parser.add_argument("--lw_video_ce", type=float, default=0.05, help="weight for video cross-entropy loss") + self.parser.add_argument("--lr_mul", type=float, default=1, help="Learning rate multiplier for backbone module") + self.parser.add_argument("--use_extend_pool", type=int, default=1000, + help="use_extend_pool") + self.parser.add_argument("--neg_video_num",type=int,default=3, + help="sample the number of negative video, " + "if neg_video_num=0, then disable shared normalization training objective") + self.parser.add_argument("--encoder_pretrain_ckpt_filepath", type=str, + default="None", + help="first_stage_pretrain checkpoint") + self.parser.add_argument("--use_interal_vr_scores", action="store_true", + help="whether to interal_vr_scores, true only for general similarity measure function") + + ## Eval config + self.parser.add_argument("--similarity_measure", + type=str, choices=["general", "exclusive","disjoint"], + default="general",help="similarity_measure_function") + # post processing + self.parser.add_argument("--min_pred_l", type=int, default=0, + help="constrain the [st, ed] with ed - st >= 1" + 
"(1 clips with length 1.5 each, 1.5 secs in total" + "this is the min length for proposal-based method)") + self.parser.add_argument("--max_pred_l", type=int, default=24, + help="constrain the [st, ed] pairs with ed - st <= 24, 36 secs in total" + "(24 clips with length 1.5 each, " + "this is the max length for proposal-based method)") + self.parser.add_argument("--max_before_nms", type=int, default=200) + self.parser.add_argument("--max_vcmr_video", type=int, default=10, + help="ranking in top-max_vcmr_video") + self.parser.add_argument("--nms_thd", type=float, default=-1, + help="additionally use non-maximum suppression " + "(or non-minimum suppression for distance)" + "to post-processing the predictions. " + "-1: do not use nms. 0.7 for tvr") + self.parser.add_argument("--eval_num_per_epoch", type=float) + + # can use config files + self.parser.add_argument('--config', help='JSON config files') + self.parser.add_argument('--model_name', type=str) + + + def display_save(self, opt): + args = vars(opt) + # Display settings + # print("------------ Options -------------\n{}\n-------------------" + # .format({str(k): str(v) for k, v in sorted(args.items())})) + print("------------ Options -------------\n{}\n-------------------" + .format(pprint.pformat({str(k): str(v) for k, v in sorted(args.items())}, indent=4))) + + + # Save settings + if not isinstance(self, TestOptions): + option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed + save_json(args, option_file_path, save_pretty=True) + + + def parse(self): + if not self.initialized: + self.initialize() + opt = parse_with_config(self.parser) + + if opt.debug: + opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) + #opt.disable_eval = True + + if isinstance(self, TestOptions): + + # modify model_dir to absolute path + opt.model_dir = os.path.join("results", opt.model_dir) + + saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) + for arg in saved_options: # use saved options to overwrite all BaseOptions args. 
+ if arg not in ["results_root", "nms_thd", "debug", "dataset_config", "model_config","device", + "eval_split_name", "bsz", "eval_context_bsz", "device_ids", + "max_vcmr_video","max_pred_l", "min_pred_l", "external_inference_vr_res_path"]: + setattr(opt, arg, saved_options[arg]) + else: + if opt.exp_id is None: + raise ValueError("--exp_id is required for at a training option!") + + opt.results_dir = os.path.join(opt.results_root, + "-".join([opt.dset_name, opt.exp_id, + time.strftime("%Y_%m_%d_%H_%M_%S")])) + mkdirp(opt.results_dir) + # save a copy of current code + code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + code_zip_filename = os.path.join(opt.results_dir, "code.zip") + make_zipfile(code_dir, code_zip_filename, + enclosing_dir="code", + exclude_dirs_substring="results", + exclude_dirs=["condor","data","results", "debug_results", "__pycache__"], + exclude_extensions=[".pyc", ".ipynb", ".swap"],) + + self.display_save(opt) + + + # assert opt.stop_task in opt.eval_tasks_at_training + opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) + opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) + opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) + opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) + opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") + + self.opt = opt + return opt + + +class TestOptions(BaseOptions): + """add additional options for evaluating""" + def initialize(self): + BaseOptions.initialize(self) + # also need to specify --eval_split_name + self.parser.add_argument("--eval_id", type=str, help="evaluation id") + self.parser.add_argument("--model_dir", type=str, + help="dir contains the model file, will be converted to absolute path afterwards") + self.parser.add_argument("--tasks", type=str, nargs="+", + choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"], + help="Which tasks to run." + "VCMR: Video Corpus Moment Retrieval;" + "SVMR: Single Video Moment Retrieval;" + "VR: regular Video Retrieval. 
(will be performed automatically with VCMR)") + +if __name__ == '__main__': + print(__file__) + print(os.path.realpath(__file__)) + code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + print(code_dir) \ No newline at end of file diff --git a/config/model_config.json b/config/model_config.json new file mode 100644 index 0000000000000000000000000000000000000000..02d3eeb3cfdbbee1fb0562df1ef72609a2703b3d --- /dev/null +++ b/config/model_config.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1458b56e285bd34b5db29a8e6babc61f9bf02d377a7ce594579baa833190f582 +size 1637 diff --git a/config/tvr_ranking_data_config_top01.json b/config/tvr_ranking_data_config_top01.json new file mode 100644 index 0000000000000000000000000000000000000000..3caca09cd16eb6ac4e7e85d97923e12add2d00d5 --- /dev/null +++ b/config/tvr_ranking_data_config_top01.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ed22c7ab836800651a9ab882496e71d93266bb6dff35c13d308243d1a5c98e +size 926 diff --git a/config/tvr_ranking_data_config_top20.json b/config/tvr_ranking_data_config_top20.json new file mode 100644 index 0000000000000000000000000000000000000000..eb6959266f20ae3b1517c96992f34609b2737761 --- /dev/null +++ b/config/tvr_ranking_data_config_top20.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509c13907d08921dd59c41b040166b4e0fd6e49260fa79adca9d23f46a804f70 +size 926 diff --git a/config/tvr_ranking_data_config_top40.json b/config/tvr_ranking_data_config_top40.json new file mode 100644 index 0000000000000000000000000000000000000000..f8405cdf6926a3bfae29f14e491eac787b01837c --- /dev/null +++ b/config/tvr_ranking_data_config_top40.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a6540a46a85534dcf79b5049cc47053cd48232f6983268a584565b4a55d48b +size 926 diff --git a/data_loader/second_stage_start_end_dataset.py b/data_loader/second_stage_start_end_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..39ebae56465c5eec920bd5d99f451243c1452390 --- /dev/null +++ b/data_loader/second_stage_start_end_dataset.py @@ -0,0 +1,349 @@ +import torch +from torch.utils.data import Dataset +import math +import os +import random +import numpy as np +from utils.basic_utils import load_json, l2_normalize_np_array, load_json +import h5py + + +class StartEndDataset(Dataset): + """ + Args: + dset_name, str, ["tvr"] + Return: + a dict: { + "model_inputs": { + "query" + "feat": torch.tensor, (max_desc_len, D_q) + "feat_mask": torch.tensor, (max_desc_len) + "feat_pos_id": torch.tensor, (max_desc_len) + "feat_token_id": torch.tensor, (max_desc_len) + "visual" + "feat": torch.tensor, (max_ctx_len, D_video) + "feat_mask": torch.tensor, (max_ctx_len) + "feat_pos_id": torch.tensor, (max_ctx_len) + "feat_token_id": torch.tensor, (max_ctx_len) + "sub" (optional) + "st_ed_indices": torch.LongTensor, (2, ) + } + } + """ + def __init__(self, config, data_path, vr_rank_path, max_ctx_len=100, max_desc_len=30, clip_length=1.5,ctx_mode="visual_sub", + is_eval = False, mode = "train", + neg_video_num=3, data_ratio=1, + use_extend_pool=500, inference_top_k=10): + + + self.dset_name = config.dset_name + self.root_path = config.root_path + + self.desc_bert_path = os.path.join(self.root_path,config.desc_bert_path) + self.vid_feat_path = os.path.join(self.root_path,config.vid_feat_path) + + self.ctx_mode = ctx_mode + self.use_sub = "sub" in self.ctx_mode + + if self.use_sub: + self.sub_bert_path = 
os.path.join(self.root_path, config.sub_bert_path) + + self.max_ctx_len = max_ctx_len + self.max_desc_len = max_desc_len + self.clip_length = clip_length + + self.neg_video_num = neg_video_num + self.is_eval = is_eval + + self.mode = mode + if mode in ["val", "test"]: + # = load_json(data_path) + self.annotations = load_json(data_path) + self.ground_truth = self.get_relevant_moment_gt() + self.annotations = self.expand_annotations( self.annotations) + if mode == "train": + self.annotations = self.expand_annotations(load_json(data_path)) + + self.first_VR_ranklist_pool_txn = h5py.File(vr_rank_path, "r") + self.query_bert_h5 = h5py.File(self.desc_bert_path, "r") + self.vid_feat_txn = h5py.File(self.vid_feat_path, "r") + if self.use_sub: + self.sub_bert_txn = h5py.File(self.sub_bert_path, "r") + + + self.inference_top_k = inference_top_k + video_data = load_json(os.path.join(self.root_path,config.video_duration_idx_path)) + + self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] + self.video2idx = {k: v[1] for k, v in video_data.items()} + self.idx2video = {v[1]:k for k, v in video_data.items()} + self.use_extend_pool = use_extend_pool + + self.normalize_vfeat = True + self.normalize_tfeat = False + + self.visual_token_id = 0 + self.text_token_id = 1 + + def __len__(self): + return len(self.annotations) + + def expand_annotations(self, annotations): + new_annotations = [] + for i in annotations: + query = i["query"] + query_id = i["query_id"] + for moment in i["relevant_moment"]: + moment.update({'query': query, 'query_id': query_id}) + new_annotations.append(moment) + return new_annotations + + def get_relevant_moment_gt(self): + gt_all = {} + for data in self.annotations: + gt_all[data["query_id"]] = data["relevant_moment"] + return gt_all + + + def pad_feature(self, feature, max_ctx_len): + """ + Args: + feature: original feature without padding + max_ctx_len: the maximum length of video clips (or query token) + + Returns: + feat_pad : padded feature + feat_mask : feature mask + """ + N_clip, feat_dim = feature.shape + + feat_pad = torch.zeros((max_ctx_len, feat_dim)) + feat_mask = torch.zeros(max_ctx_len, dtype=torch.long) + feat_pad[:N_clip, :] = torch.from_numpy(feature) + feat_mask[:N_clip] = 1 + + return feat_pad , feat_mask + + def get_query_feat_by_query_id(self, query_id, token_id=1): + """ + Args: + query_id: unique query description id + token_id: specify modality embedding + Returns: + a dict for query: { + "feat": torch.tensor, (max_desc_len, D_q) + "feat_mask": torch.tensor, (max_desc_len) + "feat_pos_id": torch.tensor, (max_desc_len) + "feat_token_id": torch.tensor, (max_desc_len) + } + """ + + query_feat = self.query_bert_h5[str(query_id)][:self.max_desc_len] + + if self.normalize_tfeat: + query_feat = l2_normalize_np_array(query_feat) + + feat_pad, feat_mask = \ + self.pad_feature(query_feat, self.max_desc_len) + + temp_model_inputs = dict() + temp_model_inputs["feat"] = feat_pad + temp_model_inputs["feat_mask"] = feat_mask + temp_model_inputs["feat_pos_id"] = torch.arange(self.max_desc_len, dtype=torch.long) + temp_model_inputs["feat_token_id"] = torch.full((self.max_desc_len,), token_id, dtype=torch.long) + + return temp_model_inputs + + def get_visual_feat_from_storage(self,vid_name): + """ + Args: + vid_name: unique video description id + Returns: + visual_feat: torch.tensor, (max_ctx_len, D_v) + Use ResNet + SlowFast , D_v = 2048 + 2304 = 4352 + """ + + visual_feat = self.vid_feat_txn[vid_name][:][:self.max_ctx_len] + + if 
self.normalize_vfeat: + visual_feat = l2_normalize_np_array(visual_feat) + + return visual_feat + + def get_sub_feat_from_storage(self,vid_name): + """ + Args: + vid_name: unique video description id + Returns: + visual_feat: torch.tensor, (max_ctx_len, D_s) + Use RoBERTa, D_s =768 + """ + + sub_feat = self.sub_bert_txn[vid_name][:][:self.max_ctx_len] + + if self.normalize_tfeat: + sub_feat = l2_normalize_np_array(sub_feat) + + return sub_feat + + def __getitem__(self, index): + + raw_data = self.annotations[index] + # if "video_name" not in raw_data.keys(): + # initialize with basic data + meta = dict( + query_id=raw_data["query_id"], + desc=raw_data["query"], + vid_name=raw_data["video_name"], + ts=raw_data["timestamp"], + ) + + # If mode is test_public, no ground-truth video_id is provided. So use a fixed dummy ground-truth video_id + if self.mode =="test_public": + meta["vid_name"] = "placeholder" + + + model_inputs = dict() + ## query information + model_inputs["query"] = self.get_query_feat_by_query_id(meta["query_id"], + token_id=self.text_token_id) + + query_id = meta["query_id"] + if query_id == 7806: + query_id += 1 + + _external_inference_vr_res = self.first_VR_ranklist_pool_txn[str(query_id)][:] + if not self.is_eval: + ##get the rank location of the ground-truth video for the first VR search engine + location = 100 + for idx, item in enumerate(_external_inference_vr_res): + if meta["vid_name"] == self.idx2video[item[0]]: + location = idx + break + + ##check all the location is below 100 when mode is train + # if self.mode =="train": + # assert 0<=location<100, meta["query_id"] + + ##get the ranklist without the ground-truth video + negative_video_pool_list = [self.idx2video[item[0]] for item in _external_inference_vr_res if meta["vid_name"] != self.idx2video[item[0]] ] + + ##sample neg_video_num negative videos for shared normalization + sampled_negative_video_pool = random.sample(negative_video_pool_list[:location+self.use_extend_pool], + k=self.neg_video_num) + ##the complete sampled video list , [pos, neg1, neg2, ...] 
+ total_vid_name_list = [meta["vid_name"],] + sampled_negative_video_pool + + self.shared_video_num = 1 + self.neg_video_num + + else: + ##during eval, use top-k videos recommended by the first VR search engine + inference_video_list = [ self.idx2video[item[0]] for item in _external_inference_vr_res[:self.inference_top_k]] + inference_video_scores = [ item[1] for item in _external_inference_vr_res[:self.inference_top_k]] + model_inputs["inference_vr_scores"] = torch.FloatTensor(inference_video_scores) + total_vid_name_list = [meta["vid_name"],] + inference_video_list + self.shared_video_num = 1 + self.inference_top_k + + # sampled neg_video_num negative videos or top-k videos + meta["sample_vid_name_list"] = total_vid_name_list[1:] + + """ + a dict for visual modality: { + "feat": torch.tensor, (shared_video_num, max_ctx_len, D_v) + "feat_mask": torch.tensor, (shared_video_num, max_ctx_len) + "feat_pos_id": torch.tensor, (shared_video_num, max_ctx_len) + "feat_token_id": torch.tensor, (shared_video_num, max_ctx_len) + } + """ + groundtruth_visual_feat = self.get_visual_feat_from_storage(meta["vid_name"]) + ctx_l, feat_dim = groundtruth_visual_feat.shape + + visual_feat_pad = torch.zeros((self.shared_video_num, self.max_ctx_len, feat_dim)) + visual_feat_mask = torch.zeros((self.shared_video_num, self.max_ctx_len), dtype=torch.long) + visual_feat_pos_id = \ + torch.repeat_interleave(torch.arange(self.max_ctx_len, dtype=torch.long).unsqueeze(0), + self.shared_video_num, dim=0) + visual_feat_token_id = torch.full((self.shared_video_num, self.max_ctx_len), self.visual_token_id, + dtype=torch.long) + + for index, video_name in enumerate(total_vid_name_list,start=0): + visual_feat = self.get_visual_feat_from_storage(video_name) + + feat_pad, feat_mask = \ + self.pad_feature(visual_feat, self.max_ctx_len) + + visual_feat_pad[index] = feat_pad + visual_feat_mask[index] = feat_mask + + temp_model_inputs = dict() + temp_model_inputs["feat"] = visual_feat_pad + temp_model_inputs["feat_mask"] = visual_feat_mask + temp_model_inputs["feat_pos_id"] = visual_feat_pos_id + temp_model_inputs["feat_token_id"] = visual_feat_token_id + + model_inputs["visual"] = temp_model_inputs + + """ + a dict for sub modality: { + "feat": torch.tensor, (shared_video_num, max_ctx_len, D_t) + "feat_mask": torch.tensor, (shared_video_num, max_ctx_len) + "feat_pos_id": torch.tensor, (shared_video_num, max_ctx_len) + "feat_token_id": torch.tensor, (shared_video_num, max_ctx_len) + } + """ + if self.use_sub: + groundtruth_sub_feat = self.get_sub_feat_from_storage(meta["vid_name"]) + + _ , feat_dim = groundtruth_sub_feat.shape + + sub_feat_pad = torch.zeros((self.shared_video_num, self.max_ctx_len, feat_dim)) + sub_feat_mask = torch.zeros((self.shared_video_num, self.max_ctx_len), dtype=torch.long) + sub_feat_pos_id = \ + torch.repeat_interleave(torch.arange(self.max_ctx_len, dtype=torch.long).unsqueeze(0), + self.shared_video_num, dim=0) + sub_feat_token_id = torch.full((self.shared_video_num, self.max_ctx_len), self.text_token_id, dtype=torch.long) + + for index, video_name in enumerate(total_vid_name_list, start=0): + sub_feat = self.get_sub_feat_from_storage(video_name) + + feat_pad, feat_mask = \ + self.pad_feature(sub_feat, self.max_ctx_len) + + sub_feat_pad[index] = feat_pad + sub_feat_mask[index] = feat_mask + + temp_model_inputs = dict() + temp_model_inputs["feat"] = sub_feat_pad + temp_model_inputs["feat_mask"] = sub_feat_mask + temp_model_inputs["feat_pos_id"] = sub_feat_pos_id + temp_model_inputs["feat_token_id"] = 
sub_feat_token_id + + model_inputs["sub"] = temp_model_inputs + + if not self.is_eval: + model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], + max_idx=ctx_l - 1) + + return dict(meta=meta, model_inputs=model_inputs) + + def get_st_ed_label(self, ts, max_idx): + """ + Args: + ts: [st (float), ed (float)] in seconds, ed > st + max_idx: length of the video + + Returns: + [st_idx, ed_idx]: int, + ed_idx >= st_idx + st_idx, ed_idx both belong to [0, max_idx-1] + + Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, + clips should be indexed as [2: 6), the translated back ts should be [3:9]. + # TODO which one is better, [2: 5] or [2: 6) + """ + st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) + ed_idx = min(math.ceil(ts[1] / self.clip_length) - 1, max_idx) # st_idx could be the same as ed_idx + assert 0 <= st_idx <= ed_idx <= max_idx, (ts, st_idx, ed_idx, max_idx) + return torch.LongTensor([st_idx, ed_idx]) + + diff --git a/inference.py b/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..739ef26704661746726b3176c99e114a421f7ac5 --- /dev/null +++ b/inference.py @@ -0,0 +1,570 @@ +import os +import pprint +from tqdm import tqdm +import numpy as np + +import torch +import torch.nn.functional as F +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader + +from config.config import TestOptions +from model.conquer import CONQUER +from data_loader.second_stage_start_end_dataset import StartEndDataset as StartEndEvalDataset +from utils.inference_utils import \ + get_submission_top_n, post_processing_vcmr_nms +from utils.basic_utils import save_json , load_config +from utils.tensor_utils import find_max_triples_from_upper_triangle_product +from standalone_eval.eval import eval_retrieval +from utils.model_utils import move_cuda , start_end_collate +from utils.model_utils import VERY_NEGATIVE_NUMBER +import logging +from time import time +from ndcg_iou_topk import calculate_ndcg_iou + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + +def generate_min_max_length_mask(array_shape, min_l, max_l): + """ The last two dimension denotes matrix of upper-triangle with upper-right corner masked, + below is the case for 4x4. + [[0, 1, 1, 0], + [0, 0, 1, 1], + [0, 0, 0, 1], + [0, 0, 0, 0]] + + Args: + array_shape: np.shape??? 
The last two dimensions should be the same + min_l: int, minimum length of predicted span + max_l: int, maximum length of predicted span + + Returns: + + """ + single_dims = (1, ) * (len(array_shape) - 2) + mask_shape = single_dims + array_shape[-2:] + extra_length_mask_array = np.ones(mask_shape, dtype=np.float32) # (1, ..., 1, L, L) + mask_triu = np.triu(extra_length_mask_array, k=min_l) + mask_triu_reversed = 1 - np.triu(extra_length_mask_array, k=max_l) + final_prob_mask = mask_triu * mask_triu_reversed + return final_prob_mask # with valid bit to be 1 + + +def get_svmr_res_from_st_ed_probs_disjoint(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + # disjoint : b_i + e_i + _st_ed_scores = np.expand_dims(svmr_gt_st_probs,axis=2) + np.expand_dims(svmr_gt_ed_probs,axis=1) + + _N_q = _st_ed_scores.shape[0] + + _valid_prob_mask = np.logical_not(generate_min_max_length_mask( + _st_ed_scores.shape, min_l=min_pred_l, max_l=max_pred_l).astype(bool)) + + valid_prob_mask = np.tile(_valid_prob_mask,(_N_q, 1, 1)) + + # invalid location will become VERY_NEGATIVE_NUMBER! + _st_ed_scores[valid_prob_mask] = VERY_NEGATIVE_NUMBER + + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + _st_ed_scores, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. + _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + +def get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, query_metas, video2idx, + clip_length, min_pred_l, max_pred_l, max_before_nms): + """ + Args: + svmr_gt_st_probs: np.ndarray (N_queries, L, L), value range [0, 1] + svmr_gt_ed_probs: + query_metas: + video2idx: + clip_length: float, how long each clip is in seconds + min_pred_l: int, minimum number of clips + max_pred_l: int, maximum number of clips + max_before_nms: get top-max_before_nms predictions for each query + + Returns: + + """ + svmr_res = [] + query_vid_names = [e["vid_name"] for e in query_metas] + + # masking very long ones! Since most are relatively short. + # general/exclusive : \hat{b_i} * \hat{e_i} + st_ed_prob_product = np.einsum("bm,bn->bmn", svmr_gt_st_probs, svmr_gt_ed_probs) # (N, L, L) + + valid_prob_mask = generate_min_max_length_mask(st_ed_prob_product.shape, min_l=min_pred_l, max_l=max_pred_l) + st_ed_prob_product *= valid_prob_mask # invalid location will become zero! 
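As an illustrative aside (not part of `inference.py`), the masked outer product above can be sanity-checked with a toy NumPy snippet. Assuming `min_l=1` and `max_l=3`, the mask reproduces the 4x4 matrix shown in the `generate_min_max_length_mask` docstring, and only spans inside that band can win the argmax:

```python
import numpy as np

L, min_l, max_l = 4, 1, 3  # assumed toy values, matching the docstring example
ones = np.ones((1, L, L), dtype=np.float32)
# np.triu(..., k=min_l) keeps spans with ed - st >= min_l;
# 1 - np.triu(..., k=max_l) keeps spans with ed - st < max_l.
mask = np.triu(ones, k=min_l) * (1 - np.triu(ones, k=max_l))
print(mask[0].astype(int))
# [[0 1 1 0]
#  [0 0 1 1]
#  [0 0 0 1]
#  [0 0 0 0]]

# Toy start/end probabilities for a single query over L clips.
st_probs = np.array([[0.7, 0.1, 0.1, 0.1]], dtype=np.float32)
ed_probs = np.array([[0.1, 0.1, 0.2, 0.6]], dtype=np.float32)
scores = np.einsum("bm,bn->bmn", st_probs, ed_probs) * mask
best = np.unravel_index(scores.argmax(), scores.shape)
# best == (0, 0, 2): the highest-scoring valid span starts at clip 0 and ends at clip 2.
```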
+ + batched_sorted_triples = find_max_triples_from_upper_triangle_product( + st_ed_prob_product, top_n=max_before_nms, prob_thd=None) + for i, q_vid_name in tqdm(enumerate(query_vid_names), + desc="[SVMR] Loop over queries to generate predictions", + total=len(query_vid_names)): # i is query_id + q_m = query_metas[i] + video_idx = video2idx[q_vid_name] + _sorted_triples = batched_sorted_triples[i] + _sorted_triples[:, 1] += 1 # as we redefined ed_idx, which is inside the moment. + _sorted_triples[:, :2] = _sorted_triples[:, :2] * clip_length + # [video_idx(int), st(float), ed(float), score(float)] + cur_ranked_predictions = [[video_idx, ] + row for row in _sorted_triples.tolist()] + cur_query_pred = dict( + query_id=q_m["query_id"], + desc=q_m["desc"], + predictions=cur_ranked_predictions + ) + svmr_res.append(cur_query_pred) + return svmr_res + + + +def compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=200, max_n_videos=100, tasks=("SVMR",)): + """ + Use val set to do evaluation, remember to run with torch.no_grad(). + model : CONQUER + eval_dataset : + opt : + max_before_nms : max moment number before non-maximum suppression + tasks: evaluation tasks + + general/exclusive function : r * \hat{b_i} + \hat{e_i} + """ + is_vr = "VR" in tasks + is_vcmr = "VCMR" in tasks + is_svmr = "SVMR" in tasks + + video2idx = eval_dataset.video2idx + + model.eval() + query_eval_loader = DataLoader(eval_dataset, + collate_fn= start_end_collate, + batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, + shuffle=False, + pin_memory=True) + + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + + if is_vcmr: + flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int) + flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32) + + if is_vr : + if opt.use_interal_vr_scores: + sorted_q2c_indices = np.tile(np.arange(max_n_videos, dtype=int),n_total_query).reshape(n_total_query,max_n_videos) + sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32) + else: + sorted_q2c_indices = np.empty((n_total_query, max_n_videos), dtype=int) + sorted_q2c_scores = np.empty((n_total_query, max_n_videos), dtype=np.float32) + + if is_svmr: + svmr_gt_st_probs = np.zeros((n_total_query, opt.max_ctx_len), dtype=np.float32) + svmr_gt_ed_probs = np.zeros((n_total_query, opt.max_ctx_len), dtype=np.float32) + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + + _query_metas = batch["meta"] + query_metas.extend(batch["meta"]) + + if opt.device.type == "cuda": + model_inputs = move_cuda(batch["model_inputs"], opt.device) + else: + model_inputs = batch["model_inputs"] + + + video_similarity_score, begin_score_distribution, end_score_distribution = \ + model.get_pred_from_raw_query(model_inputs) + + if is_svmr: + _svmr_st_probs = begin_score_distribution[:, 0] + _svmr_ed_probs = end_score_distribution[:, 0] + + # normalize to get true probabilities!!! 
+ # the probabilities here are already (pad) masked, so only need to do softmax + _svmr_st_probs = F.softmax(_svmr_st_probs, dim=-1) # (_N_q, L) + _svmr_ed_probs = F.softmax(_svmr_ed_probs, dim=-1) + if opt.debug: + print("svmr_st_probs: ", _svmr_st_probs) + + svmr_gt_st_probs[idx * bsz:(idx + 1) * bsz] = \ + _svmr_st_probs.cpu().numpy() + + svmr_gt_ed_probs[idx * bsz:(idx + 1) * bsz] = \ + _svmr_ed_probs.cpu().numpy() + + _vcmr_st_prob = begin_score_distribution[:, 1:] + _vcmr_ed_prob = end_score_distribution[:, 1:] + + if not (is_vr or is_vcmr): + continue + + if opt.use_interal_vr_scores: + bs = begin_score_distribution.size()[0] + _sorted_q2c_indices = torch.arange(max_n_videos).to(begin_score_distribution.device).repeat(bs,1) + _sorted_q2c_scores = model_inputs["inference_vr_scores"] + if is_vr: + sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = model_inputs["inference_vr_scores"].cpu().numpy() + else: + video_similarity_score = video_similarity_score[:, 1:] + _query_context_scores = torch.softmax(video_similarity_score,dim=1) + + # Get top-max_n_videos videos for each query + _sorted_q2c_scores, _sorted_q2c_indices = \ + torch.topk(_query_context_scores, max_n_videos, dim=1, largest=True) + if is_vr: + sorted_q2c_indices[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_indices.cpu().numpy() + sorted_q2c_scores[idx * bsz:(idx + 1) * bsz] = _sorted_q2c_scores.cpu().numpy() + + + if not is_vcmr: + continue + + + # normalize to get true probabilities!!! + # the probabilities here are already (pad) masked, so only need to do softmax + _st_probs = F.softmax(_vcmr_st_prob, dim=-1) # (_N_q, N_videos, L) + _ed_probs = F.softmax(_vcmr_ed_prob, dim=-1) + + + # Get VCMR results + # compute combined scores + row_indices = torch.arange(0, len(_st_probs), device=opt.device).unsqueeze(1) + _st_probs = _st_probs[row_indices, _sorted_q2c_indices] # (_N_q, max_n_videos, L) + _ed_probs = _ed_probs[row_indices, _sorted_q2c_indices] + + # (_N_q, max_n_videos, L, L) + # general/exclusive : r * \hat{b_i} * \hat{e_i} + _st_ed_scores = torch.einsum("qvm,qv,qvn->qvmn", _st_probs, _sorted_q2c_scores, _ed_probs) + + valid_prob_mask = generate_min_max_length_mask( + _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l) + + _st_ed_scores *= torch.from_numpy( + valid_prob_mask).to(_st_ed_scores.device) # invalid location will become zero! + + _n_q = _st_ed_scores.shape[0] + + # sort across the total_n_videos videos (by flatten from the 2nd dim) + # the indices here are local indices, not global indices + + _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1) # (N_q, total_n_videos*L*L) + _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \ + torch.sort(_flat_st_ed_scores, dim=1, descending=True) + + # collect data + flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_sorted_scores[:, :max_before_nms].detach().cpu().numpy() + flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_scores_sorted_indices[:, :max_before_nms].detach().cpu().numpy() + + if opt.debug: + break + + # Numpy starts here!!! 
+ vr_res = [] + if is_vr: + for i, (_sorted_q2c_scores_row, _sorted_q2c_indices_row) in tqdm( + enumerate(zip(sorted_q2c_scores, sorted_q2c_indices)), + desc="[VR] Loop over queries to generate predictions", total=n_total_query): + cur_vr_redictions = [] + query_specific_video_metas = query_metas[i]["sample_vid_name_list"] + for j, (v_score, v_meta_idx) in enumerate(zip(_sorted_q2c_scores_row, _sorted_q2c_indices_row)): + video_idx = video2idx[query_specific_video_metas[v_meta_idx]] + cur_vr_redictions.append([video_idx, 0, 0, float(v_score)]) + cur_query_pred = dict( + query_id=query_metas[i]["query_id"], + desc=query_metas[i]["desc"], + predictions=cur_vr_redictions + ) + vr_res.append(cur_query_pred) + + svmr_res = [] + if is_svmr: + svmr_res = get_svmr_res_from_st_ed_probs(svmr_gt_st_probs, svmr_gt_ed_probs, + query_metas, video2idx, + clip_length=opt.clip_length, + min_pred_l=opt.min_pred_l, + max_pred_l=opt.max_pred_l, + max_before_nms=max_before_nms) + + + vcmr_res = [] + if is_vcmr: + for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm( + enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)), + desc="[VCMR] Loop over queries to generate predictions", total=n_total_query): # i is query_idx + # list([video_idx(int), st(float), ed(float), score(float)]) + video_meta_indices_local, pred_st_indices, pred_ed_indices = \ + np.unravel_index(_flat_st_ed_scores_sorted_indices, + shape=(max_n_videos, opt.max_ctx_len, opt.max_ctx_len)) + # video_meta_indices refers to the indices among the total_n_videos + # video_meta_indices_local refers to the indices among the top-max_n_videos + # video_meta_indices refers to the indices in all the videos, which is the True indices + video_meta_indices = sorted_q2c_indices[i, video_meta_indices_local] + + pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length + pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length + cur_vcmr_redictions = [] + query_specific_video_metas = query_metas[i]["sample_vid_name_list"] + for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices, _flat_st_ed_sorted_scores)): # videos + video_idx = video2idx[query_specific_video_metas[v_meta_idx]] + cur_vcmr_redictions.append( + [video_idx, float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j]), float(v_score)]) + + cur_query_pred = dict( + query_id=query_metas[i]["query_id"], + desc=query_metas[i]["desc"], + predictions=cur_vcmr_redictions) + vcmr_res.append(cur_query_pred) + + res = dict(VCMR=vcmr_res, SVMR=svmr_res, VR=vr_res) + return {k: v for k, v in res.items() if len(v) != 0} + + +def compute_query2ctx_info_disjoint(model, eval_dataset, opt, + max_before_nms=200, max_n_videos=100, maxtopk = 40): + """Use val set to do evaluation, remember to run with torch.no_grad(). 
+ model : CONQUER + eval_dataset : + opt : + max_before_nms : max moment number before non-maximum suppression + tasks: evaluation tasks + + disjoint function : b_i + e_i + + """ + video2idx = eval_dataset.video2idx + + model.eval() + query_eval_loader = DataLoader(eval_dataset, collate_fn= start_end_collate, batch_size=opt.eval_query_bsz, + num_workers=opt.num_workers, shuffle=False, pin_memory=True) + + n_total_query = len(eval_dataset) + bsz = opt.eval_query_bsz + + flat_st_ed_scores_sorted_indices = np.empty((n_total_query, max_before_nms), dtype=int) + flat_st_ed_sorted_scores = np.zeros((n_total_query, max_before_nms), dtype=np.float32) + + + query_metas = [] + for idx, batch in tqdm( + enumerate(query_eval_loader), desc="Computing q embedding", total=len(query_eval_loader)): + + query_metas.extend(batch["meta"]) + if opt.device.type == "cuda": + model_inputs = move_cuda(batch["model_inputs"], opt.device) + + else: + model_inputs = batch["model_inputs"] + + _ , begin_score_distribution, end_score_distribution = model.get_pred_from_raw_query(model_inputs) + + begin_score_distribution = begin_score_distribution[:,1:] + end_score_distribution= end_score_distribution[:,1:] + + # Get VCMR results + # (_N_q, total_n_videos, L, L) + # b_i + e_i + _st_ed_scores = torch.unsqueeze(begin_score_distribution, 3) + torch.unsqueeze(end_score_distribution, 2) + + _n_q, total_n_videos = _st_ed_scores.size()[:2] + + + ## mask the invalid location out of moment length constrain + _valid_prob_mask = np.logical_not(generate_min_max_length_mask( + _st_ed_scores.shape, min_l=opt.min_pred_l, max_l=opt.max_pred_l).astype(bool)) + + _valid_prob_mask = torch.from_numpy(_valid_prob_mask).to(_st_ed_scores.device) + + valid_prob_mask = _valid_prob_mask.repeat(_n_q,total_n_videos,1,1) + + # invalid location will become VERY_NEGATIVE_NUMBER! 
+ _st_ed_scores[valid_prob_mask] = VERY_NEGATIVE_NUMBER + + # sort across the total_n_videos videos (by flatten from the 2nd dim) + # the indices here are local indices, not global indices + _flat_st_ed_scores = _st_ed_scores.reshape(_n_q, -1) # (N_q, total_n_videos*L*L) + _flat_st_ed_sorted_scores, _flat_st_ed_scores_sorted_indices = \ + torch.sort(_flat_st_ed_scores, dim=1, descending=True) + + # collect data + flat_st_ed_sorted_scores[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_sorted_scores[:, :max_before_nms].detach().cpu().numpy() + flat_st_ed_scores_sorted_indices[idx * bsz:(idx + 1) * bsz] = \ + _flat_st_ed_scores_sorted_indices[:, :max_before_nms].detach().cpu().numpy() + + + + vcmr_res = {} + for i, (_flat_st_ed_scores_sorted_indices, _flat_st_ed_sorted_scores) in tqdm( + enumerate(zip(flat_st_ed_scores_sorted_indices, flat_st_ed_sorted_scores)), + desc="[VCMR] Loop over queries to generate predictions", total=n_total_query): # i is query_idx + # list([video_idx(int), st(float), ed(float), score(float)]) + video_meta_indices_local, pred_st_indices, pred_ed_indices = \ + np.unravel_index(_flat_st_ed_scores_sorted_indices, + shape=(total_n_videos, opt.max_ctx_len, opt.max_ctx_len)) + + pred_st_in_seconds = pred_st_indices.astype(np.float32) * opt.clip_length + pred_ed_in_seconds = pred_ed_indices.astype(np.float32) * opt.clip_length + opt.clip_length + cur_vcmr_redictions = [] + query_specific_video_metas = query_metas[i]["sample_vid_name_list"] + for j, (v_meta_idx, v_score) in enumerate(zip(video_meta_indices_local, _flat_st_ed_sorted_scores)): # videos + # video_idx = video2idx[query_specific_video_metas[v_meta_idx]] + cur_vcmr_redictions.append( + { + "video_name": query_specific_video_metas[v_meta_idx], + "timestamp": [float(pred_st_in_seconds[j]), float(pred_ed_in_seconds[j])], + "model_scores": float(v_score) + } + ) + query_id=query_metas[i]["query_id"] + vcmr_res[query_id] = cur_vcmr_redictions[:maxtopk] + return vcmr_res + +def get_eval_res(model, eval_dataset, opt): + """compute and save query and video proposal embeddings""" + + if opt.similarity_measure == "disjoint": #disjoint b_i+ e_i + eval_res = compute_query2ctx_info_disjoint(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=opt.max_vcmr_video) + elif opt.similarity_measure in ["general" , "exclusive" ] : # r * \hat{b_i} * \hat{e_i} + eval_res = compute_query2ctx_info(model, eval_dataset, opt, + max_before_nms=opt.max_before_nms, + max_n_videos=opt.max_vcmr_video, + tasks=tasks) + + + return eval_res + + +POST_PROCESSING_MMS_FUNC = { + "SVMR": post_processing_vcmr_nms, + "VCMR": post_processing_vcmr_nms +} + +def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + +def eval_epoch(model, eval_dataset, opt, max_after_nms, iou_thds, topks): + + pred_data = get_eval_res(model, eval_dataset, opt) + # video2idx = eval_dataset.video2idx + # pred_data = get_prediction_top_n(eval_res, top_n=max_after_nms) + # pred_data = get_prediction_top_n(eval_res, top_n=max_after_nms) + gt_data = eval_dataset.ground_truth + average_ndcg = calculate_ndcg_iou(gt_data, pred_data, iou_thds, topks) + return average_ndcg, pred_data + + + +def setup_model(opt): + """Load model from checkpoint and move to specified device""" + checkpoint = torch.load(opt.ckpt_filepath) + loaded_model_cfg = checkpoint["model_cfg"] + + model = CONQUER(loaded_model_cfg, + 
visual_dim=opt.visual_dim, + text_dim=opt.text_dim, + query_dim=opt.query_dim, + hidden_dim=opt.hidden_dim, + video_len=opt.max_ctx_len, + ctx_mode=opt.ctx_mode, + no_output_moe_weight=opt.no_output_moe_weight, + similarity_measure=opt.similarity_measure, + use_debug = opt.debug) + model.load_state_dict(checkpoint["model"]) + + logger.info("Loaded model saved at epoch {} from checkpoint: {}" + .format(checkpoint["epoch"], opt.ckpt_filepath)) + + if opt.device.type == "cuda": + logger.info("CUDA enabled.") + model.to(opt.device) + assert len(opt.device_ids) == 1 + # if len(opt.device_ids) > 1: + # logger.info("Use multi GPU", opt.device_ids) + # model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU + return model + + +def start_inference(): + logger.info("Setup config, data and model...") + opt = TestOptions().parse() + cudnn.benchmark = False + cudnn.deterministic = True + + data_config = load_config(opt.dataset_config) + + eval_dataset = StartEndEvalDataset( + config = data_config, + max_ctx_len=opt.max_ctx_len, + max_desc_len= opt.max_desc_len, + clip_length = opt.clip_length, + ctx_mode = opt.ctx_mode, + mode = opt.eval_split_name, + data_ratio = opt.data_ratio, + is_eval = True, + inference_top_k = opt.max_vcmr_video) + + postfix = "_hero" + model = setup_model(opt) + save_submission_filename = "inference_{}_{}_{}_predictions_{}{}.json".format( + opt.dset_name, opt.eval_split_name, opt.eval_id, "_".join(opt.tasks),postfix) + print(save_submission_filename) + logger.info("Starting inference...") + with torch.no_grad(): + metrics_no_nms, metrics_nms, latest_file_paths = \ + eval_epoch(model, eval_dataset, opt, save_submission_filename, + tasks=opt.tasks, max_after_nms=100) + logger.info("metrics_no_nms \n{}".format(pprint.pformat(metrics_no_nms, indent=4))) + logger.info("metrics_nms \n{}".format(pprint.pformat(metrics_nms, indent=4))) + + +if __name__ == '__main__': + start_inference() diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/backbone/__init__.py b/model/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/backbone/encoder.py b/model/backbone/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f7a946a1184de753ea03309e9afff571a8b582a7 --- /dev/null +++ b/model/backbone/encoder.py @@ -0,0 +1,235 @@ +""" +Pytorch modules +some classes are modified from HuggingFace +(https://github.com/huggingface/transformers) +""" + +import torch +import logging +from torch import nn +logger = logging.getLogger(__name__) + +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + +from model.transformer.bert import BertEncoder +from model.layers import (NetVLAD, LinearLayer) +from model.transformer.bert_embed import (BertEmbeddings) +from utils.model_utils import mask_logits +import torch.nn.functional as F + + + +class TransformerBaseModel(nn.Module): + """ + Base Transformer model + """ + def __init__(self, config): + super(TransformerBaseModel, self).__init__() + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + + def forward(self,features,position_ids,token_type_ids,attention_mask): + # embedding layer + embedding_output = self.embeddings(token_type_ids=token_type_ids, + 
inputs_embeds=features, + position_ids=position_ids) + + encoder_outputs = self.encoder(embedding_output, attention_mask) + + sequence_output = encoder_outputs[0] + + return sequence_output + +class TwoModalEncoder(nn.Module): + """ + Two modality Transformer Encoder model + """ + + def __init__(self, config,img_dim,text_dim,hidden_dim,split_num,output_split=True): + super(TwoModalEncoder, self).__init__() + self.img_linear = LinearLayer( + in_hsz=img_dim, out_hsz=hidden_dim) + self.text_linear = LinearLayer( + in_hsz=text_dim, out_hsz=hidden_dim) + + self.transformer = TransformerBaseModel(config) + self.output_split = output_split + if self.output_split: + self.split_num = split_num + + + def forward(self, visual_features, visual_position_ids, visual_token_type_ids, visual_attention_mask, + text_features,text_position_ids,text_token_type_ids,text_attention_mask): + + transformed_im = self.img_linear(visual_features) + transformed_text = self.text_linear(text_features) + + transformer_input_feat = torch.cat((transformed_im,transformed_text),dim=1) + transformer_input_feat_pos_id = torch.cat((visual_position_ids,text_position_ids),dim=1) + transformer_input_feat_token_id = torch.cat((visual_token_type_ids,text_token_type_ids),dim=1) + transformer_input_feat_mask = torch.cat((visual_attention_mask,text_attention_mask),dim=1) + + output = self.transformer(features=transformer_input_feat, + position_ids=transformer_input_feat_pos_id, + token_type_ids=transformer_input_feat_token_id, + attention_mask=transformer_input_feat_mask) + + if self.output_split: + return torch.split(output,self.split_num,dim=1) + else: + return output + + +class OneModalEncoder(nn.Module): + """ + One modality Transformer Encoder model + """ + + def __init__(self, config,input_dim,hidden_dim): + super(OneModalEncoder, self).__init__() + self.linear = LinearLayer( + in_hsz=input_dim, out_hsz=hidden_dim) + self.transformer = TransformerBaseModel(config) + + def forward(self, features, position_ids, token_type_ids, attention_mask): + + transformed_features = self.linear(features) + + output = self.transformer(features=transformed_features, + position_ids=position_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask) + return output + + +class VideoQueryEncoder(nn.Module): + def __init__(self, config, video_modality, + visual_dim=4352, text_dim= 768, + query_dim=768, hidden_dim = 768,split_num=100,): + super(VideoQueryEncoder, self).__init__() + self.use_sub = len(video_modality) > 1 + if self.use_sub: + self.videoEncoder = TwoModalEncoder(config=config.bert_config, + img_dim = visual_dim, + text_dim = text_dim , + hidden_dim = hidden_dim, + split_num = split_num + ) + else: + self.videoEncoder = OneModalEncoder(config=config.bert_config, + input_dim = visual_dim, + hidden_dim = hidden_dim, + ) + + self.queryEncoder = OneModalEncoder(config=config.query_bert_config, + input_dim= query_dim, + hidden_dim=hidden_dim, + ) + + def forward_repr_query(self, batch): + + query_output = self.queryEncoder( + features=batch["query"]["feat"], + position_ids=batch["query"]["feat_pos_id"], + token_type_ids=batch["query"]["feat_token_id"], + attention_mask=batch["query"]["feat_mask"] + ) + + return query_output + + def forward_repr_video(self,batch): + video_output = dict() + + if len(batch["visual"]["feat"].size()) == 4: + bsz, num_video = batch["visual"]["feat"].size()[:2] + for key in batch.keys(): + if key in ["visual", "sub"]: + for key_2 in batch[key]: + if key_2 in ["feat", "feat_mask", "feat_pos_id", 
"feat_token_id"]: + shape_list = batch[key][key_2].size()[2:] + batch[key][key_2] = batch[key][key_2].view((bsz * num_video,) + shape_list) + + + if self.use_sub: + video_output["visual"], video_output["sub"] = self.videoEncoder( + visual_features=batch["visual"]["feat"], + visual_position_ids=batch["visual"]["feat_pos_id"], + visual_token_type_ids=batch["visual"]["feat_token_id"], + visual_attention_mask=batch["visual"]["feat_mask"], + text_features=batch["sub"]["feat"], + text_position_ids=batch["sub"]["feat_pos_id"], + text_token_type_ids=batch["sub"]["feat_token_id"], + text_attention_mask=batch["sub"]["feat_mask"] + ) + else: + video_output["visual"] = self.videoEncoder( + features=batch["visual"]["feat"], + position_ids=batch["visual"]["feat_pos_id"], + token_type_ids=batch["visual"]["feat_token_id"], + attention_mask=batch["visual"]["feat_mask"] + ) + + return video_output + + + def forward_repr_both(self, batch): + video_output = self.forward_repr_video(batch) + query_output = self.forward_repr_query(batch) + + return {"video_feat": video_output, + "query_feat": query_output} + + def forward(self,batch,task="repr_both"): + + if task == "repr_both": + return self.forward_repr_both(batch) + elif task == "repr_video": + return self.forward_repr_video(batch) + elif task == "repr_query": + return self.forward_repr_query(batch) + + +class QueryWeightEncoder(nn.Module): + """ + Query Weight Encoder + Using NetVLAD to aggreate contextual query features + Using FC + Softmax to get fusion weights for each modality + """ + def __init__(self, config, video_modality): + super(QueryWeightEncoder, self).__init__() + + ##NetVLAD + self.text_pooling = NetVLAD(feature_size=config.hidden_size,cluster_size=config.text_cluster) + self.moe_txt_dropout = nn.Dropout(config.moe_dropout_prob) + + ##FC + self.moe_fc_txt = nn.Linear( + in_features=self.text_pooling.out_dim, + out_features=len(video_modality), + bias=False) + + self.video_modality = video_modality + + def forward(self, query_feat): + ##NetVLAD + pooled_text = self.text_pooling(query_feat) + pooled_text = self.moe_txt_dropout(pooled_text) + + ##FC + Softmax + moe_weights = self.moe_fc_txt(pooled_text) + softmax_moe_weights = F.softmax(moe_weights, dim=1) + + + moe_weights_dict = dict() + for modality, moe_weight in zip(self.video_modality, torch.split(softmax_moe_weights, 1, dim=1)): + moe_weights_dict[modality] = moe_weight.squeeze(1) + + return moe_weights_dict + + + + diff --git a/model/conquer.py b/model/conquer.py new file mode 100644 index 0000000000000000000000000000000000000000..0139ff8bbac2c7259f056d574f1b0d3fef03eb6e --- /dev/null +++ b/model/conquer.py @@ -0,0 +1,205 @@ +import torch +import torch.nn as nn +from model.backbone.encoder import VideoQueryEncoder, QueryWeightEncoder +from model.qal.query_aware_learning_module import BiDirectionalAttention +from model.layers import FCPlusTransformer#,MomentLocalizationHead +from model.head.ml_head import MomentLocalizationHead +from model.head.vs_head import VideoScoringHead + +import logging +logger = logging.getLogger(__name__) + + +class CONQUER(nn.Module): + def __init__(self, config, + visual_dim = 4352, + text_dim = 768, + query_dim = 768, + hidden_dim = 768, + video_len = 100, + ctx_mode = "visual_sub", + lw_st_ed = 0.01, + lw_video_ce = 0.05, + similarity_measure="general", + use_debug=False, + no_output_moe_weight=False): + + super(CONQUER, self).__init__() + self.config = config + + # related configs + self.lw_st_ed = lw_st_ed + self.lw_video_ce = lw_video_ce + 
self.similarity_measure = similarity_measure + + self.video_modality = ctx_mode.split("_") + logger.info("video modality : %s" % self.video_modality) + self.output_moe_weight = not no_output_moe_weight + + hidden_dim = hidden_dim + base_bert_layer_config = config.bert_config + + ## Backbone encoder + self.encoder = VideoQueryEncoder(config,video_modality=self.video_modality, + visual_dim=visual_dim,text_dim=text_dim,query_dim=query_dim, + hidden_dim=hidden_dim,split_num=video_len) + + if self.output_moe_weight and len(self.video_modality) > 1: + self.query_weight = QueryWeightEncoder(config.netvlad_config,video_modality=self.video_modality) + + ## Query_aware_feature_learning Module + self.query_aware_feature_learning_layer = BiDirectionalAttention(hidden_dim) + + ## Shared transformer for both moment localization and video scoring heads + self.contextual_QAL_feature_learning = FCPlusTransformer(base_bert_layer_config,hidden_dim * 4) + + ## Moment_localization_head + self.moment_localization_head = MomentLocalizationHead(config.moment_localization_config,base_bert_layer_config,hidden_dim) + self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") + + ## Optional video_scoring_head + if self.similarity_measure == "exclusive": + self.video_scoring_head = VideoScoringHead(config.video_scoring_config,base_bert_layer_config,hidden_dim) + self.score_ce = nn.CrossEntropyLoss(reduction="mean") + + self.debug_model = use_debug + if self.debug_model: + logger.setLevel(level=logging.DEBUG) + + self.reset_parameters() + + def reset_parameters(self): + """ Initialize the weights.""" + + def re_init(module): + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + #print("nn.Linear, nn.Embedding: ", module) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + module.reset_parameters() + + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + self.apply(re_init) + + + def compute_final_score(self,score_dict,moe_weights=None): + + sample_key = list(score_dict.keys())[0] + final_query_context_scores = torch.zeros_like(score_dict[sample_key]) + shape_size = len(score_dict[sample_key].shape) + if moe_weights is not None: + for mod in self.video_modality: + if shape_size == 2: + final_query_context_scores += torch.einsum("nm,n->nm", score_dict[mod], moe_weights[mod]) + elif shape_size == 3: + final_query_context_scores += torch.einsum("nlm,n->nlm", score_dict[mod], moe_weights[mod]) + else: + for mod in self.video_modality: + final_query_context_scores += torch.div(score_dict[mod], len(self.video_modality)) + + return final_query_context_scores + + + def get_pred_from_raw_query(self, batch): + + ## Extract query and video feature through MMT backbone + _query_feature = self.encoder(batch, task="repr_query") #Widehat_Q + + _video_feature_dict = self.encoder(batch, task="repr_video") #Widehat_V and #Widehat_S + + ## Shared normalization technique + ## Use the same query feature for shared_video_num times + sample_key = list(_video_feature_dict.keys())[0] + query_batch = _query_feature.size()[0] + video_batch, video_len = _video_feature_dict[sample_key].size()[:2] + shared_video_num = int(video_batch / query_batch) + + query_feature = 
torch.repeat_interleave(_query_feature, shared_video_num, dim=0) + query_mask = torch.repeat_interleave(batch["query"]["feat_mask"], shared_video_num, dim=0) + + + ## Compute Query Dependent Fusion video feature + if self.output_moe_weight and len(self.video_modality) > 1: + moe_weights_dict = self.query_weight(query_feature) + QDF_feature = self.compute_final_score(_video_feature_dict, moe_weights_dict) + else: + QDF_feature = self.compute_final_score(_video_feature_dict,None) + + video_mask = batch["visual"]["feat_mask"] + + + ## Compute Query Aware Learning video feature + QAL_feature = self.query_aware_feature_learning_layer(QDF_feature, query_feature, + video_mask,query_mask) + + ## Contextualize QAL features + Contextual_QAL = self.contextual_QAL_feature_learning( + features=QAL_feature, + feat_mask=video_mask) + + G = torch.cat([QAL_feature,Contextual_QAL], dim=2) + + ## Moment localization head + begin_score_distribution , end_score_distribution = self.moment_localization_head(G,Contextual_QAL,video_mask) + begin_score_distribution = begin_score_distribution.view(query_batch, shared_video_num, video_len) + end_score_distribution = end_score_distribution.view(query_batch, shared_video_num, video_len) + + ## Optional video scoring head + video_similarity_score = None + if self.similarity_measure == "exclusive": + video_similarity_score = self.video_scoring_head(G,video_mask) + video_similarity_score = video_similarity_score.view(query_batch, shared_video_num) + + return video_similarity_score, begin_score_distribution , end_score_distribution + + + def get_moment_loss_share_norm(self, begin_score_distribution, end_score_distribution ,st_ed_indices): + + bs , shared_video_num , video_len = begin_score_distribution.size() + + begin_score_distribution = begin_score_distribution.view(bs,-1) + end_score_distribution = end_score_distribution.view(bs,-1) + + loss_st = self.temporal_criterion(begin_score_distribution, st_ed_indices[:, 0]) + loss_ed = self.temporal_criterion(end_score_distribution, st_ed_indices[:, 1]) + moment_ce_loss = loss_st + loss_ed + + return moment_ce_loss + + + def forward(self,batch): + + video_similarity_score, begin_score_distribution , end_score_distribution = \ + self.get_pred_from_raw_query(batch) + + moment_ce_loss, video_ce_loss = 0, 0 + + # moment cross-entropy loss + # if neg_video_num = 0, we do not sample negative videos + # the softmax operator is performed only for the ground-truth video + # which mean to not use shared normalization training objective + moment_ce_loss = self.get_moment_loss_share_norm( + begin_score_distribution, end_score_distribution, batch["st_ed_indices"]) + moment_ce_loss = self.lw_st_ed * moment_ce_loss + + if self.similarity_measure == "exclusive": + ce_label = batch["st_ed_indices"].new_zeros(video_similarity_score.size()[0]) + video_ce_loss = self.score_ce(video_similarity_score, ce_label) + video_ce_loss = self.lw_video_ce*video_ce_loss + + + loss = moment_ce_loss + video_ce_loss + return loss, {"moment_ce_loss": float(moment_ce_loss), + "video_ce_loss": float(video_ce_loss), + "loss_overall": float(loss)} + + + + diff --git a/model/head/__init__.py b/model/head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/head/ml_head.py b/model/head/ml_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8be34a4ef332e698bc4655999f1dda4f346135f4 --- /dev/null +++ b/model/head/ml_head.py @@ -0,0 +1,61 @@ +import torch +from 
torch import nn +import logging +logger = logging.getLogger(__name__) + + +from model.layers import FCPlusTransformer, ConvSE + + +class MomentLocalizationHead(nn.Module): + """ + Moment localization head model + """ + + def __init__(self, config,base_bert_layer_config,hidden_dim): + super(MomentLocalizationHead, self).__init__() + + base_bert_layer_config = base_bert_layer_config + hidden_dim = hidden_dim + + self.begin_feature_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 5) + + self.end_feature_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 2) + + self.begin_score_modeling = ConvSE(config) + self.end_score_modeling = ConvSE(config) + + def forward(self, G, Contextual_QAL, video_mask): + """ + Inputs: + :param contextual_qal_features: (batch, feat_size, L_v) + :param video_mask: (batch, L_v) + Return: + score: (begin or end) score distribution + """ + ## OUTPUT LAYER + begin_features = self.begin_feature_modeling( + features=G, + feat_mask=video_mask) + + end_features = self.end_feature_modeling( + features=torch.cat([Contextual_QAL, begin_features], dim=2), + feat_mask=video_mask) + + ## Un-normalized + begin_input_feature = torch.transpose(begin_features, 1, 2) + end_input_feature = torch.transpose(end_features, 1, 2) + + begin_score_distribution = self.begin_score_modeling( + contextual_qal_features=begin_input_feature, + video_mask=video_mask, + ) + + end_score_distribution = self.end_score_modeling( + contextual_qal_features=end_input_feature, + video_mask=video_mask, + ) + + return begin_score_distribution , end_score_distribution + + diff --git a/model/head/vs_head.py b/model/head/vs_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a94c3a4bb5c9f85dd0250a82a7fcb8124775de3d --- /dev/null +++ b/model/head/vs_head.py @@ -0,0 +1,42 @@ +import torch +from torch import nn + +import logging +logger = logging.getLogger(__name__) + +from model.layers import FCPlusTransformer + +class VideoScoringHead(nn.Module): + """ + Video Scoring Head + """ + + def __init__(self, config,base_bert_layer_config,hidden_dim): + super(VideoScoringHead, self).__init__() + + base_bert_layer_config = base_bert_layer_config + hidden_dim = hidden_dim + + + self.video_feature_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 5) + + self.video_score_predictor = nn.Sequential( + nn.Linear(**config.linear_1_cfg), + nn.ReLU(), + nn.Linear(**config.linear_2_cfg) + ) + + + def forward(self, G, video_mask): + + + ## Contexual_QAL_feature for video scoring + R = self.video_feature_modeling( + features=G, + feat_mask=video_mask) + + holistic_video_feature, _ = torch.max(R, dim=1) + + video_similarity_score = self.video_score_predictor(holistic_video_feature.squeeze(1)) # r + + return video_similarity_score \ No newline at end of file diff --git a/model/layers.py b/model/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1001996c43ced2fb6f5216cef166299b197447 --- /dev/null +++ b/model/layers.py @@ -0,0 +1,196 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +import logging + +logger = logging.getLogger(__name__) +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + +from model.transformer.bert import BertEncoder +from model.modeling_utils import mask_logits + +class LinearLayer(nn.Module): + """linear layer configurable with layer normalization, dropout, ReLU.""" + 
def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True,tanh=False): + super(LinearLayer, self).__init__() + self.relu = relu + self.tanh = tanh + self.layer_norm = layer_norm + if layer_norm: + self.LayerNorm = BertLayerNorm(in_hsz) + layers = [ + nn.Dropout(dropout), + nn.Linear(in_hsz, out_hsz) + ] + self.net = nn.Sequential(*layers) + + def forward(self, x): + """(N, L, D)""" + if self.layer_norm: + x = self.LayerNorm(x) + x = self.net(x) + if self.relu: + x = F.relu(x, inplace=True) + if self.tanh: + x = torch.tanh(x) + return x # (N, L, D) + + +class NetVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_norm=True): + super(NetVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(feature_size, cluster_size)) + self.clusters2 = nn.Parameter((1 / math.sqrt(feature_size)) + * torch.randn(1, feature_size, cluster_size)) + + self.add_norm = add_norm + self.LayerNorm = BertLayerNorm(cluster_size) + self.out_dim = cluster_size * feature_size + + def forward(self, x): + max_sample = x.size()[1] + x = x.view(-1, self.feature_size) + assignment = torch.matmul(x, self.clusters) + + if self.add_norm: + assignment = self.LayerNorm(assignment) + + assignment = F.softmax(assignment, dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + a_sum = torch.sum(assignment, -2, keepdim=True) + a = a_sum * self.clusters2 + + assignment = assignment.transpose(1, 2) + + x = x.view(-1, max_sample, self.feature_size) + vlad = torch.matmul(assignment, x) + vlad = vlad.transpose(1, 2) + vlad = vlad - a + + # L2 intra norm + vlad = F.normalize(vlad) + + # flattening + L2 norm + vlad = vlad.reshape(-1, self.cluster_size * self.feature_size) + vlad = F.normalize(vlad) + + return vlad + + +class FCPlusTransformer(nn.Module): + """ + FC + Transformer + FC layer reduces input feature size into hidden size + Transformer contextualizes QAL feature + """ + + def __init__(self, config,input_dim): + super(FCPlusTransformer, self).__init__() + self.trans_linear = LinearLayer( + in_hsz=input_dim, out_hsz=config.hidden_size) + self.encoder = BertEncoder(config) + + def forward(self,features, feat_mask): + """ + Inputs: + :param contextual_qal_features: (batch, L_v, input_dim) + :param feat_mask: (batch, L_v) + Return: + sequence_output: (batch, L_v, hidden_size) + """ + transformed_features = self.trans_linear(features) + + encoder_outputs = self.encoder(transformed_features, feat_mask) + + sequence_output = encoder_outputs[0] + + return sequence_output + + +class ConvSE(nn.Module): + """ + ConvSE module + """ + def __init__(self, config): + super(ConvSE, self).__init__() + + self.clip_score_predictor = nn.Sequential( + nn.Conv1d(**config.conv_cfg_1), + nn.ReLU(), + nn.Conv1d(**config.conv_cfg_2), + ) + + + def forward(self, contextual_qal_features, video_mask): + """ + Inputs: + :param contextual_qal_features: (batch, feat_size, L_v) + :param video_mask: (batch, L_v) + Return: + score: (begin or end) score distribution + """ + score = self.clip_score_predictor(contextual_qal_features).squeeze(1) #(batch, L_v) + + score = mask_logits(score, video_mask) #(batch, L_v) + + return score + + +class MomentLocalizationHead(nn.Module): + """ + Moment localization head model + """ + + def __init__(self, config,base_bert_layer_config,hidden_dim): + super(MomentLocalizationHead, self).__init__() + + base_bert_layer_config = base_bert_layer_config + hidden_dim = 
hidden_dim + + self.start_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 5) + + self.end_modeling = FCPlusTransformer(base_bert_layer_config, hidden_dim * 2) + + self.start_reader = ConvSE(config) + self.end_reader = ConvSE(config) + + def forward(self, G, Contextual_QAL, video_mask): + """ + Inputs: + :param contextual_qal_features: (batch, feat_size, L_v) + :param video_mask: (batch, L_v) + Return: + score: (begin or end) score distribution + """ + ## OUTPUT LAYER + start_features = self.start_modeling( + features=G, + feat_mask=video_mask) + + end_features = self.end_modeling( + features=torch.cat([Contextual_QAL, start_features], dim=2), + feat_mask=video_mask) + + ## Un-normalized + start_reader_input_feature = torch.transpose(start_features, 1, 2) + end_reader_input_feature = torch.transpose(end_features, 1, 2) + + reader_st_prob = self.start_reader( + contextual_qal_features=start_reader_input_feature, + video_mask=video_mask, + ) + + reader_ed_prob = self.end_reader( + contextual_qal_features=end_reader_input_feature, + video_mask=video_mask, + ) + + return reader_st_prob,reader_ed_prob diff --git a/model/modeling_utils.py b/model/modeling_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..61e3c10d22fc50aa780fe4a9d7d81844e9401b0b --- /dev/null +++ b/model/modeling_utils.py @@ -0,0 +1,135 @@ +""" +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. + +some functions are modified from HuggingFace +(https://github.com/huggingface/transformers) +""" +import torch +from torch import nn +import logging +logger = logging.getLogger(__name__) + + +def prune_linear_layer(layer, index, dim=0): + """ Prune a linear layer (a model parameters) + to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear( + new_size[1], new_size[0], bias=layer.bias is not None).to( + layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +def mask_logits(target, mask, eps=-1e4): + return target * mask + (1 - mask) * eps + + +def load_partial_checkpoint(checkpoint, n_layers, skip_layers=True): + if skip_layers: + new_checkpoint = {} + gap = int(12/n_layers) + prefix = "roberta.encoder.layer." 
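+        # With n_layers=6 the gap is 2, so pretrained layers 1,3,5,7,9,11 are kept + # and renumbered to 0-5; with n_layers=1 only layer 11 is kept as layer 0. + # Checkpoint layers outside this mapping are dropped.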
+ layer_range = {str(l): str(i) for i, l in enumerate( + list(range(gap-1, 12, gap)))} + for k, v in checkpoint.items(): + if prefix in k: + layer_name = k.split(".") + layer_num = layer_name[3] + if layer_num in layer_range: + layer_name[3] = layer_range[layer_num] + new_layer_name = ".".join(layer_name) + new_checkpoint[new_layer_name] = v + else: + new_checkpoint[k] = v + else: + new_checkpoint = checkpoint + return new_checkpoint + + +def load_pretrained_weight(model, state_dict): + # Load from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = ({} if metadata is None + else metadata.get(prefix[:-1], {})) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, + unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + start_prefix = '' + if not hasattr(model, 'roberta') and\ + any(s.startswith('roberta.') for s in state_dict.keys()): + start_prefix = 'roberta.' + + load(model, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from " + "pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in " + "{}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for ' + '{}:\n\t{}'.format( + model.__class__.__name__, + "\n\t".join(error_msgs))) + return model + + +def pad_tensor_to_mul(tensor, dim=0, mul=8): + """ pad tensor to multiples (8 for tensor cores) """ + t_size = list(tensor.size()) + n_pad = mul - t_size[dim] % mul + if n_pad == mul: + n_pad = 0 + padded_tensor = tensor + else: + t_size[dim] = n_pad + pad = torch.zeros(*t_size, dtype=tensor.dtype, device=tensor.device) + padded_tensor = torch.cat([tensor, pad], dim=dim) + return padded_tensor, n_pad diff --git a/model/qal/__init__.py b/model/qal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/qal/query_aware_learning_module.py b/model/qal/query_aware_learning_module.py new file mode 100644 index 0000000000000000000000000000000000000000..cd64822b49e5d2c1591c0582fea8f081cea68672 --- /dev/null +++ b/model/qal/query_aware_learning_module.py @@ -0,0 +1,92 @@ +import torch +from torch import nn + +import logging +logger = logging.getLogger(__name__) + +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + +from utils.model_utils import mask_logits +import torch.nn.functional as F + + +class BiDirectionalAttention(nn.Module): + """ + Bi-directional attention flow + Perform query-to-video attention (Q2V) and video-to-query attention (V2Q) + Append QDF 
features with a set of query-aware features to form QAL feature + """ + + def __init__(self, video_dim): + super(BiDirectionalAttention, self).__init__() + ## Core Attention for query-aware feature learining + self.similarity_weight = nn.Linear(video_dim * 3, 1, bias=False) + + + def forward(self, QDF_emb, query_emb,video_mask, query_mask): + """ + Inputs: + :param QDF_emb: (batch, L_v, feat_size) + :param query_emb: (batch, L_q, feat_size) + :param video_mask: (batch, L_v) + :param query_mask: (batch, L_q) + Return: + QAL: (batch, L_v, feat_size*4) + """ + + ## CREATE SIMILARITY MATRIX + video_len = QDF_emb.size()[1] + query_len = query_emb.size()[1] + + _QDF_emb = QDF_emb.unsqueeze(2).repeat(1, 1, query_len, 1) + # [bs, video_len, 1, feat_size] => [bs, video_len, query_len, feat_size] + + _query_emb = query_emb.unsqueeze(1).repeat(1, video_len, 1, 1) + # [bs, 1, query_len, feat_size] => [bs, video_len, query_len, feat_size] + + elementwise_prod = torch.mul(_QDF_emb, _query_emb) + # [bs, video_len, query_len, feat_size] + + alpha = torch.cat([_QDF_emb, _query_emb, elementwise_prod], dim=3) + # [bs, video_len, query_len, feat_size*3] + + similarity_matrix = self.similarity_weight(alpha).view(-1, video_len, query_len) + + similarity_matrix_mask = torch.einsum("bn,bm->bnm", video_mask, query_mask) + # [bs, video_len, query_len] + + ## CALCULATE Video2Query ATTENTION + + a = F.softmax(mask_logits(similarity_matrix, + similarity_matrix_mask), dim=-1) + # [bs, video_len, query_len] + + V2Q = torch.bmm(a, query_emb) + # [bs] ([video_len, query_len] X [query_len, feat_size]) => [bs, video_len, feat_size] + + ## CALCULATE Query2Video ATTENTION + + b = F.softmax(torch.max(mask_logits(similarity_matrix, similarity_matrix_mask), 2)[0], dim=-1) + # [bs, video_len] + + b = b.unsqueeze(1) + # [bs, 1, video_len] + + Q2V = torch.bmm(b, QDF_emb) + # [bs] ([bs, 1, video_len] X [bs, video_len, feat_size]) => [bs, 1, feat_size] + + Q2V = Q2V.repeat(1, video_len, 1) + # [bs, video_len, feat_size] + + ## Append QDF_emb with three query-aware features + + QAL = torch.cat([QDF_emb, V2Q, + torch.mul(QDF_emb, V2Q), + torch.mul(QDF_emb, Q2V)], dim=2) + + # [bs, video_len, feat_size*4] + + return QAL \ No newline at end of file diff --git a/model/transformer/__init__.py b/model/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/model/transformer/bert.py b/model/transformer/bert.py new file mode 100644 index 0000000000000000000000000000000000000000..4d49740bc70c765020c6f7856115b7228a4dd75c --- /dev/null +++ b/model/transformer/bert.py @@ -0,0 +1,275 @@ +""" +BERT/RoBERTa layers from the huggingface implementation +(https://github.com/huggingface/transformers) +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from model.modeling_utils import prune_linear_layer +import math +import logging +logger = logging.getLogger(__name__) +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + BertLayerNorm = torch.nn.LayerNorm + + +def gelu(x): + """ Original Implementation of the gelu activation function + in Google Bert repo when initialy created. 
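+        It returns the exact form 0.5 * x * (1 + erf(x / sqrt(2))).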
+ For information: OpenAI GPT's gelu is slightly different + (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) + * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def gelu_new(x): + """ Implementation of the gelu activation function currently + in Google Bert repo (identical to OpenAI GPT). + Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * ( + 1 + torch.tanh( + math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = { + "gelu": gelu, + "relu": torch.nn.functional.relu, + "swish": swish, "gelu_new": gelu_new} + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of " + "the number of attention heads (%d)" % ( + config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int( + config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads *\ + self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" + # and "key" to get the raw attention scores. + attention_scores = torch.matmul( + query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is + # (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
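+        # attention_probs: (batch, num_attention_heads, seq_len, seq_len)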
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs)\ + if self.output_attentions else (context_layer,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones( + self.self.num_attention_heads, self.self.attention_head_size) + # Convert to set and emove already pruned heads + heads = set(heads) - self.pruned_heads + for head in heads: + # Compute how many pruned heads are + # before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size =\ + self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, input_tensor, attention_mask=None, head_mask=None): + self_outputs = self.self(input_tensor, attention_mask, head_mask) + attention_output = self.output(self_outputs[0], input_tensor) + # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, 
input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + attention_outputs = self.attention( + hidden_states, attention_mask, head_mask) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + # add attentions if we output them + outputs = (layer_output,) + attention_outputs[1:] + return outputs + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = nn.ModuleList([BertLayer(config) for _ in range( + config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to + # [batch_size, num_heads, from_seq_length, to_seq_length] + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, extended_attention_mask, None) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + # last-layer hidden state, (all hidden states), (all attentions) + return outputs diff --git a/model/transformer/bert_embed.py b/model/transformer/bert_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..73751ebda8f2342d3159e4fd893c34d03c900d13 --- /dev/null +++ b/model/transformer/bert_embed.py @@ -0,0 +1,64 @@ +""" +Input Embedding Layers +""" +import torch +import torch.nn as nn +import logging + + +logger = logging.getLogger(__name__) +try: + import apex.normalization.fused_layer_norm.FusedLayerNorm as BertLayerNorm +except (ImportError, AttributeError) as e: + logger.info( + "Better speed can be achieved with apex installed from " + "https://www.github.com/nvidia/apex ." 
+ ) + BertLayerNorm = torch.nn.LayerNorm + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + #self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + # self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + # self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + position_embeddings = self.position_embeddings(position_ids) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + diff --git a/ndcg_iou_topk.py b/ndcg_iou_topk.py new file mode 100644 index 0000000000000000000000000000000000000000..b35d646c57473d1e3b6eb5550bcb6a1845ebd678 --- /dev/null +++ b/ndcg_iou_topk.py @@ -0,0 +1,66 @@ +from utils.basic_utils import load_jsonl, save_jsonl, load_json +import pandas as pd +from tqdm import tqdm +import numpy as np +from collections import defaultdict +import copy + +def calculate_iou(pred_start: float, pred_end: float, gt_start: float, gt_end: float) -> float: + intersection_start = max(pred_start, gt_start) + intersection_end = min(pred_end, gt_end) + intersection = max(0, intersection_end - intersection_start) + union = (pred_end - pred_start) + (gt_end - gt_start) - intersection + return intersection / union if union > 0 else 0 + + +# Function to calculate DCG +def calculate_dcg(scores): + return sum((2**score - 1) / np.log2(idx + 2) for idx, score in enumerate(scores)) + +# Function to calculate NDCG +def calculate_ndcg(pred_scores, true_scores): + dcg = calculate_dcg(pred_scores) + idcg = calculate_dcg(sorted(true_scores, reverse=True)) + return dcg / idcg if idcg > 0 else 0 + + + +def calculate_ndcg_iou(all_gt, all_pred, TS, KS): + performance = defaultdict(lambda: defaultdict(list)) + performance_avg = defaultdict(lambda: defaultdict(float)) + for k in tqdm(all_pred.keys(), desc="Calculate NDCG"): + one_pred = all_pred[k] + one_gt = all_gt[k] + + one_gt.sort(key=lambda x: x["relevance"], reverse=True) + for T in TS: + one_gt_drop = copy.deepcopy(one_gt) + predictions_with_scores = [] + + for pred in one_pred: + pred_video_name, pred_time = pred["video_name"], 
pred["timestamp"] + matched_rows = [gt for gt in one_gt_drop if gt["video_name"] == pred_video_name] + if not matched_rows: + pred["pred_relevance"] = 0 + else: + ious = [calculate_iou(pred_time[0], pred_time[1], gt["timestamp"][0], gt["timestamp"][1]) for gt in matched_rows] + max_iou_idx = np.argmax(ious) + max_iou_row = matched_rows[max_iou_idx] + + if ious[max_iou_idx] > T: + pred["pred_relevance"] = max_iou_row["relevance"] + # Remove the matched ground truth row + original_idx = one_gt_drop.index(max_iou_row) + one_gt_drop.pop(original_idx) + else: + pred["pred_relevance"] = 0 + predictions_with_scores.append(pred) + for K in KS: + true_scores = [gt["relevance"] for gt in one_gt][:K] + pred_scores = [pred["pred_relevance"] for pred in predictions_with_scores][:K] + ndcg_score = calculate_ndcg(pred_scores, true_scores) + performance[K][T].append(ndcg_score) + for K, vs in performance.items(): + for T, v in vs.items(): + performance_avg[K][T] = np.mean(v) + return performance_avg diff --git a/optim/adamw.py b/optim/adamw.py new file mode 100644 index 0000000000000000000000000000000000000000..485d0570fd8be0b4d70c5d39e231e8e611e9ee54 --- /dev/null +++ b/optim/adamw.py @@ -0,0 +1,106 @@ +""" +AdamW optimizer (weight decay fix) +originally from hugginface (https://github.com/huggingface/transformers). + +Copied from UNITER +(https://github.com/ChenRocks/UNITER) +""" +import math + +import torch +from torch.optim import Optimizer + + +class AdamW(Optimizer): + """ Implements Adam algorithm with weight decay fix. + Parameters: + lr (float): learning rate. Default 1e-3. + betas (tuple of 2 floats): Adams beta parameters (b1, b2). + Default: (0.9, 0.999) + eps (float): Adams epsilon. Default: 1e-6 + weight_decay (float): Weight decay. Default: 0.0 + correct_bias (bool): can be set to False to avoid correcting bias + in Adam (e.g. like in Bert TF repository). Default True. + """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, + weight_decay=0.0, correct_bias=True): + if lr < 0.0: + raise ValueError( + "Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter: {} - " + "should be in [0.0, 1.0[".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter: {} - " + "should be in [0.0, 1.0[".format(betas[1])) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {} - " + "should be >= 0.0".format(eps)) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + correct_bias=correct_bias) + super(AdamW, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError( + 'Adam does not support sparse ' + 'gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + exp_avg.mul_(beta1).add_(grad , alpha=1.0 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) + denom = exp_avg_sq.sqrt().add_(group['eps']) + + step_size = group['lr'] + if group['correct_bias']: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state['step'] + bias_correction2 = 1.0 - beta2 ** state['step'] + step_size = (step_size * math.sqrt(bias_correction2) + / bias_correction1) + + p.data.addcdiv_(exp_avg, denom, value=-step_size) + + # Just adding the square of the weights to the loss function is + # *not* the correct way of using L2 regularization/weight decay + # with Adam, since that will interact with the m and v + # parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't + # interact with the m/v parameters. This is equivalent to + # adding the square of the weights to the loss with plain + # (non-momentum) SGD. + # Add weight decay at the end (fixed version) + if group['weight_decay'] > 0.0: + p.data.add_(p.data, alpha=-group['lr'] * group['weight_decay']) + + return loss diff --git a/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01.log b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01.log new file mode 100644 index 0000000000000000000000000000000000000000..1053dba705d5253f44765cecffddc72e9cede8b4 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4d870ccff8ab61b72571cd7c9f84eb916d84fd7f091b2e300dfb9d4be5ee518 +size 29628 diff --git a/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01_back.log b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01_back.log new file mode 100644 index 0000000000000000000000000000000000000000..86854b9f67defe55568d8cf31987a26891f8c952 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/20240708_171830_conquer_top01_back.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef85a542568c80fab7d57d69041ebd898e30d4fc912082bd4d571aea3ec6424c +size 29917 diff --git a/results/tvr-top01-2024_07_08_17_18_30/best_test_predictions.json b/results/tvr-top01-2024_07_08_17_18_30/best_test_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..2b6aacaf7ac7175a22af8df0248032f6c0ed6577 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/best_test_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0becb2747c635a0080149ccb3e92975f7bf4bf3a99d025fd41d29ae9287db438 +size 14263264 diff --git a/results/tvr-top01-2024_07_08_17_18_30/best_val_predictions.json 
b/results/tvr-top01-2024_07_08_17_18_30/best_val_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..dc97f037c6ff1aea91b6954fed138ecd48459920 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/best_val_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ced0079b54bdbc05268645d80c6fa52b1ed44c6e04f6922d535be29aa3fd8c +size 2560976 diff --git a/results/tvr-top01-2024_07_08_17_18_30/code.zip b/results/tvr-top01-2024_07_08_17_18_30/code.zip new file mode 100644 index 0000000000000000000000000000000000000000..58dd71a9e4af908ac0eda92c20785c87a430e792 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/code.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b0711364459d5340f2e887420295145188a9008d5b50b5ddde46b221645c23 +size 1141392 diff --git a/results/tvr-top01-2024_07_08_17_18_30/model.ckpt b/results/tvr-top01-2024_07_08_17_18_30/model.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..2eb63a9bff37b0aff8d11b4e3a2d3d40b19c17e5 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/model.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa2b8044636fe7ce9ab4d36df179ec2358f10a579de4ee5a7e58f338553558d2 +size 190742082 diff --git a/results/tvr-top01-2024_07_08_17_18_30/opt.json b/results/tvr-top01-2024_07_08_17_18_30/opt.json new file mode 100644 index 0000000000000000000000000000000000000000..f659161dbce9315db3afb6f07f436bdc09437da6 --- /dev/null +++ b/results/tvr-top01-2024_07_08_17_18_30/opt.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c93c28739229f5e35afc1239e1f30e0cad28353909eed88b6d65732943a5ac61 +size 1370 diff --git a/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20.log b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20.log new file mode 100644 index 0000000000000000000000000000000000000000..f02f07187a75772c6e33e37b9119a9c2a2de1a9a --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea621825b2f1d618daf456f872246d6d50bd3729a36606c7cdcf75dcddbec57a +size 30298 diff --git a/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20_back.log b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20_back.log new file mode 100644 index 0000000000000000000000000000000000000000..70e4442166fc357908c393d6e759de13925e62b5 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/20240708_211947_conquer_top20_back.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b9976e0b0049f434e91251cfcde27b9a2334e95216d995ada4699f83d889c9 +size 31752 diff --git a/results/tvr-top20-2024_07_08_21_19_47/best_test_predictions.json b/results/tvr-top20-2024_07_08_21_19_47/best_test_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..5639876bebc87d566824ea9638bb335433c70eba --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/best_test_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12895f4d15d70eff1737745bda045cf6fb1bf6e85aa4e8c4cdd86633cb70274a +size 14324579 diff --git a/results/tvr-top20-2024_07_08_21_19_47/best_val_predictions.json b/results/tvr-top20-2024_07_08_21_19_47/best_val_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..de0d7b9bb06dc200822c0064147bd717f9fcc4cf --- /dev/null +++ 
b/results/tvr-top20-2024_07_08_21_19_47/best_val_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103076d328e1b7efdc2773625c38fc73a29492a67bcb27e023af73f8b21c8732 +size 2571786 diff --git a/results/tvr-top20-2024_07_08_21_19_47/code.zip b/results/tvr-top20-2024_07_08_21_19_47/code.zip new file mode 100644 index 0000000000000000000000000000000000000000..58dd71a9e4af908ac0eda92c20785c87a430e792 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/code.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b0711364459d5340f2e887420295145188a9008d5b50b5ddde46b221645c23 +size 1141392 diff --git a/results/tvr-top20-2024_07_08_21_19_47/model.ckpt b/results/tvr-top20-2024_07_08_21_19_47/model.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..c9a590306d62773c881a69ba31beb6f6a2e46775 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/model.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baff5eaebb7f211640af4e21f2876be344eaa95431ab32398ac7260e9803471f +size 190742082 diff --git a/results/tvr-top20-2024_07_08_21_19_47/opt.json b/results/tvr-top20-2024_07_08_21_19_47/opt.json new file mode 100644 index 0000000000000000000000000000000000000000..0176d847c25f14ba27dfd1451356bb0b6da1d651 --- /dev/null +++ b/results/tvr-top20-2024_07_08_21_19_47/opt.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90d02a58cbb9a5ea0f23e3fefedd3f8f7b8852332b4877cfe7ba2833ca699071 +size 1368 diff --git a/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log new file mode 100644 index 0000000000000000000000000000000000000000..b9f1268610a5c735972e7a48f929a783e0f3e028 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:895455a13565da5f3d44126722152288a3057649fef1daa94d7558d490d97d81 +size 24491 diff --git a/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40_back.log b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40_back.log new file mode 100644 index 0000000000000000000000000000000000000000..f7ad9c63e1627cded243fb06478e3b70fb6e27e9 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40_back.log @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6085e3055b53b0afc63799813027a70b1d1999beeecf22b0accda3b5a60fe8cc +size 26137 diff --git a/results/tvr-top40-2024_07_11_10_58_46/best_test_predictions.json b/results/tvr-top40-2024_07_11_10_58_46/best_test_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..d0a434614b7ee800f239c5e6a76019d5c26e4721 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/best_test_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5deaab54d6eec95172c5877b38dc72712f76b0357f26e255938a55835627ed2c +size 14329598 diff --git a/results/tvr-top40-2024_07_11_10_58_46/best_val_predictions.json b/results/tvr-top40-2024_07_11_10_58_46/best_val_predictions.json new file mode 100644 index 0000000000000000000000000000000000000000..98fabefdbf27f245fa101dbc8eabe5b50a71b003 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/best_val_predictions.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d7b68cde82958c1a7039210d2ac4bb5cfb5083abee6bbb550083395061a8a8 +size 2572649 diff 
--git a/results/tvr-top40-2024_07_11_10_58_46/code.zip b/results/tvr-top40-2024_07_11_10_58_46/code.zip new file mode 100644 index 0000000000000000000000000000000000000000..54e30461b2f77e125a11b55c9432c386f9d694ad --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/code.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88e51fa09336f4a4545dc2e281cfe8cea943daf17de87c12b6b75d226fdb61dd +size 1141399 diff --git a/results/tvr-top40-2024_07_11_10_58_46/model.ckpt b/results/tvr-top40-2024_07_11_10_58_46/model.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..972140053e36fd7f042646e5c34b5b2e871e5674 --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/model.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eba8e53656fed1ddcbb7d8129bd6c72862797c63684f11121a9a78c86b30c70 +size 190742082 diff --git a/results/tvr-top40-2024_07_11_10_58_46/opt.json b/results/tvr-top40-2024_07_11_10_58_46/opt.json new file mode 100644 index 0000000000000000000000000000000000000000..b224f2b0c6d74eb320f9825a406b104cd8e2078f --- /dev/null +++ b/results/tvr-top40-2024_07_11_10_58_46/opt.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e03b5de0524d803c796aaef3fa4aaf1152cfae63644403e236262fe1a4663b3 +size 1368 diff --git a/run_disjoint_top01.sh b/run_disjoint_top01.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d4108989ee2e7b4325c9985aeb4de98721898ee --- /dev/null +++ b/run_disjoint_top01.sh @@ -0,0 +1,19 @@ +python train.py \ + --model_name conquer \ + --dataset_config config/tvr_ranking_data_config_top01.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --max_vcmr_video 10 \ + --similarity_measure disjoint \ + --bsz 196 \ + --eval_query_bsz 8 \ + --eval_num_per_epoch 0.05 \ + --n_epoch 4000 \ + --exp_id top01 + + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_disjoint_top01.sh + diff --git a/run_disjoint_top20.sh b/run_disjoint_top20.sh new file mode 100644 index 0000000000000000000000000000000000000000..8d86aa0d9761a30974f0f7ccb52b0d0c2cdc0386 --- /dev/null +++ b/run_disjoint_top20.sh @@ -0,0 +1,19 @@ +python train.py \ + --model_name conquer \ + --dataset_config config/tvr_ranking_data_config_top20.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --max_vcmr_video 10 \ + --similarity_measure disjoint \ + --bsz 196 \ + --eval_query_bsz 8 \ + --eval_num_per_epoch 1 \ + --n_epoch 200 \ + --exp_id top20 + + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_disjoint_top20.sh + diff --git a/run_disjoint_top40.sh b/run_disjoint_top40.sh new file mode 100644 index 0000000000000000000000000000000000000000..067287d8b3d6aa601c9cc7f09a719f3283431668 --- /dev/null +++ b/run_disjoint_top40.sh @@ -0,0 +1,19 @@ +python train.py \ + --model_name conquer \ + --dataset_config config/tvr_ranking_data_config_top40.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --max_vcmr_video 10 \ + --similarity_measure disjoint \ + --bsz 196 \ + --eval_query_bsz 8 \ + --eval_num_per_epoch 2 \ + --n_epoch 100 \ + --exp_id top40 + + # qsub -I -l select=1:ngpus=1 
-P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_disjoint_top40.sh + diff --git a/standalone_eval/__init__.py b/standalone_eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/standalone_eval/eval.py b/standalone_eval/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..164a18e0aa40e7973cade49f4f4dd24fcf497678 --- /dev/null +++ b/standalone_eval/eval.py @@ -0,0 +1,300 @@ +""" +Load prediction file and GT file to calculate TVR metrics: +- recall at top K (R@K), for a specified IoU, where K in [1, 5, 10, 100], IoU in [0.5, 0.7] +""" +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict, defaultdict + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def pad_sequences_1d_np(sequences, dtype=np.float32): + + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max(lengths)) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max(lengths)), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask + + +def compute_temporal_iou_batch(preds, gt): + """ compute intersection-over-union along temporal axis + This function is significantly faster than `compute_temporal_iou`, + the result should be the same. 
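+    Note: the denominator below is the span from the earliest start to the latest end rather + than a strict union; for two single intervals this equals the true union whenever they + overlap, and when they are disjoint the intersection is zero anyway, so the returned IoU + is unchanged.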
+ Args: + preds: np.ndarray, (N, 2), [st (float), ed (float)] * N + gt: [st (float), ed (float)] + Returns: + iou (float): np.ndarray, (N, ) + + References: + for np.divide with zeros, see https://stackoverflow.com/a/37977222 + """ + intersection = np.maximum(0, np.minimum(preds[:, 1], gt[1]) - np.maximum(preds[:, 0], gt[0])) + union = np.maximum(preds[:, 1], gt[1]) - np.minimum(preds[:, 0], gt[0]) # not the correct union though + return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0) + + +def get_rounded_percentage(float_number, n_floats=2): + return round(float_number * 100, n_floats) + + +TASK_TYPES = OrderedDict([ + ("VCMR", "Video Corpus Moment Retrieval"), + ("SVMR", "Single Video Moment Retrieval"), + ("VR", "regular Video Retrieval") +]) + + +def eval_by_task_type(moment_predictions, video2idx, ground_truth, + iou_thds=(0.5, 0.7), recall_topks=(1, 5, 10, 100), + task_type="SVMR", max_pred_per_query=100, match_number=True, verbose=True, use_desc_type=True): + """ a predicted triplet is positive only if: + 1) its vid_name matches the GT vid_name + 2) IoU between its timestamp and GT timestamp is higher than the given threshold + + moment_predictions w.r.t. different task_type: + For each query, evaluated on top max_pred_per_query [vid_name, st, ed] triplets. (score entry ignored) + VCMR: vid_name might be repeating. + SVMR: vid_name is fixed to be the GT vid_name. + VR: vid_name is not repeating, st and ed will not be used. + + Args: + video2idx: {vid_name (str): index (int), ...} + moment_predictions: list(dict), each dict is { + "desc": str, + "query_id": int, + "predictions": [vid_name_idx (int), st (float), ed (float), score (float)] * n_pred, + sorted predictions, n_pred could be different for all dicts. For each prediction, + only the first 3 elements [vid_name (str), st (float), ed (float),] are used, + any other following elements are ignored. We leave score here for record. + } + ground_truth: list(dict), each dict is { + "desc": str, + "query_id": int, + "type": str, one of [v, t, vt] + "vid_name": str + "ts": [st (float), ed (float)], or list([st (float), ed (float)]), len == 4. + ... + } + iou_thds: temporal IoU thresholds + recall_topks: recall at different top k + task_type: str, could be: ["VCMR", "SVMR", "VR"], see TASK_TYPES for definition. + max_pred_per_query: int, only top max_pred_per_query predictions for each query are used. + match_number: bool, must set to True if when do evaluation, False is only used for debug. 
+        verbose: bool, print intermediate information if True
+        use_desc_type: bool, only TVR has desc type annotations (v / t / vt)
+    Returns:
+        metrics, metrics_by_type: OrderedDicts mapping metric names to rounded percentages
+    """
+    assert task_type in TASK_TYPES, "task_type must be one of {}".format(list(TASK_TYPES.keys()))
+    if verbose:
+        print("Running evaluation with task_type {}, n results {}; n gt {}"
+              .format(task_type, len(moment_predictions), len(ground_truth)))
+
+    predictions_by_query_id = {e["query_id"]: e for e in moment_predictions}
+    gt_by_query_id = {e["query_id"]: e for e in ground_truth}
+    desc_type2idx = {"v": 0, "t": 1, "vt": 2}
+    desc_types = []  # n_desc
+
+    if match_number:
+        assert set(gt_by_query_id.keys()) == set(predictions_by_query_id.keys()), \
+            "query_ids in predictions and ground_truth must match"
+        # assert len(set([len(e["predictions"]) for e in predictions_by_query_id.values()])) == 1, \
+        #     "all queries must have the same number of predictions"
+
+    pred_info_matrix_collection = []
+    for k, gt_item in tqdm(gt_by_query_id.items(), desc="Loop over moments", leave=False):
+        if not match_number and k not in predictions_by_query_id:
+            continue
+        pred_info_matrix = np.array(
+            [e[:3] for e in predictions_by_query_id[k]["predictions"]][:max_pred_per_query],
+            dtype=np.float32)  # (n_pred, 3)
+        if use_desc_type:
+            desc_types.append(desc_type2idx[gt_item["type"]])
+        vid_name_matched_pred = pred_info_matrix[:, 0] == video2idx[gt_item["vid_name"]]  # bool, (n_pred, )
+        pred_info_matrix = np.concatenate([pred_info_matrix, vid_name_matched_pred[:, None]], axis=1)  # (n_pred, 4)
+
+        # add 1 + len(iou_thds) columns, iou_scores, iou_corrects for each iou_thd.
+        iou_thd_corrects_columns = []
+        if len(gt_item["ts"]) >= 4:  # didemo, for all 3 splits, at least 4 ts for each, < 0.5% has more than 4.
+            least_n_overlap = 2  # True if overlapped with at least least_n_overlap GT ts.
+            iou_corrects_dict = defaultdict(list)
+            for single_gt_ts in gt_item["ts"]:
+                single_gt_ts = np.array(single_gt_ts, dtype=np.float32)  # (2, )
+                # iou scores of the predictions that have wrong vid_name are set to 0.
+                iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred
+                for iou_thd in iou_thds:
+                    iou_corrects_dict[iou_thd].append(iou_scores >= iou_thd)
+            for iou_thd in iou_thds:
+                iou_corrects = sum(iou_corrects_dict[iou_thd]) >= least_n_overlap  # bool, (n_pred, )
+                iou_thd_corrects_columns.append(iou_corrects[:, None])
+
+        else:  # should be 2, len([st, ed]) == 2
+            single_gt_ts = np.array(gt_item["ts"], dtype=np.float32)  # (2, )
+            # iou scores of the predictions that have wrong vid_name are set to 0.
+ iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred + + for iou_thd in iou_thds: + iou_corrects = iou_scores >= iou_thd # bool, (n_pred, ) + iou_thd_corrects_columns.append(iou_corrects[:, None]) + + pred_info_matrix = np.concatenate([pred_info_matrix, ] + iou_thd_corrects_columns, axis=1) # (n_pred, 6) + pred_info_matrix_collection.append(pred_info_matrix) + + # column header [vid_name_idx (int), st (float), ed (float), is_vid_name_match (bool), + # iou_scores>=iou_thd0 (bool), iou_scores>=iou_thd1 (bool)] + pred_info_matrix_collection = pad_sequences_1d_np(pred_info_matrix_collection)[0] # (n_desc, n_pred, 6) + if use_desc_type: + desc_types = np.array(desc_types) # (n_desc) + + # results wrapper + metrics = OrderedDict() + metrics_by_type = OrderedDict() + + iou_c_offset = 4 # iou_corrects column index starts here + if task_type == "VCMR": + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = \ + get_rounded_percentage(np.mean(np.sum(iou_corrects[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(iou_corrects[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type + ) + elif task_type == "SVMR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + n_desc = len(vid_name_matched) + for iou_idx, iou_thd in enumerate(iou_thds): + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics["{}-r{}".format(iou_thd, k)] = get_rounded_percentage(np.mean( + [np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 for idx in range(n_desc)] + )) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for iou_idx, iou_thd in enumerate(iou_thds): + # (n_desc, n_pred) + iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) + # 1) there might be more than one positive clip, so use `>= 1` + for k in recall_topks: + metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( + 1.0 * np.sum([np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 and type_corrects[idx] + for idx in range(n_desc)]) + / n_desc_in_type) + + elif task_type == "VR": + vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred) + for k in recall_topks: + metrics["r{}".format(k)] = \ + get_rounded_percentage(np.mean(np.sum(vid_name_matched[:, :k], axis=1) >= 1)) + if use_desc_type: + for desc_type in desc_type2idx: + type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) + n_desc_in_type = np.sum(type_corrects) # (n_desc) + for k in recall_topks: + metrics_by_type["{}-r{}".format(desc_type, k)] = 
get_rounded_percentage( + 1.0 * np.sum(np.logical_and(np.sum(vid_name_matched[:, :k], axis=1) >= 1, type_corrects)) + / n_desc_in_type) + else: + raise ValueError("task_type wrong.") + if use_desc_type: + metrics_by_type["desc_type_ratio"] = "v {} t {} vt {}"\ + .format(*[get_rounded_percentage(1.0 * np.sum(desc_types == desc_type2idx[k]) / len(desc_types)) + for k in ["v", "t", "vt"]]) + return metrics, metrics_by_type + + +def eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True, match_number=True, use_desc_type=True): + video2idx = submission["video2idx"] + submitted_task_types = [k for k in TASK_TYPES if k in submission] + if verbose: + print("Evaluating for task {}".format(submitted_task_types)) + eval_metrics = OrderedDict() + metrics_raw_dict = {} + for task_type in submitted_task_types: + metrics, metrics_by_type = eval_by_task_type( + submission[task_type], video2idx, ground_truth, + iou_thds=iou_thds, recall_topks=(1, 5, 10, 100), + task_type=task_type, max_pred_per_query=100, + match_number=match_number, verbose=verbose, use_desc_type=use_desc_type) + metrics_raw_dict[task_type] = metrics + metrics_raw_dict[task_type+"_by_type"] = metrics_by_type + + for task_type in submitted_task_types: + eval_metrics[task_type] = metrics_raw_dict[task_type] + if use_desc_type: + for task_type in submitted_task_types: + eval_metrics[task_type+"_by_type"] = metrics_raw_dict[task_type+"_by_type"] + return eval_metrics + + +def eval_main(): + import argparse + parser = argparse.ArgumentParser(description="TVR Evaluation Script") + parser.add_argument("--submission_path", type=str, help="path to generated prediction file") + parser.add_argument("--gt_path", type=str, help="path to GT file") + parser.add_argument("--save_path", type=str, help="path to save the results") + parser.add_argument("--not_verbose", action="store_true") + args = parser.parse_args() + + verbose = not args.not_verbose + submission = load_json(args.submission_path) + gt = load_jsonl(args.gt_path) + results = eval_retrieval(submission, gt, iou_thds=(0.5, 0.7), verbose=verbose) + if verbose: + print(json.dumps(results, indent=4)) + + with open(args.save_path, "w") as f: + f.write(json.dumps(results, indent=4)) + + +if __name__ == '__main__': + eval_main() diff --git a/train.py b/train.py new file mode 100644 index 0000000000000000000000000000000000000000..d4c6d7a5325ca3e7459c9e220cddac31de951154 --- /dev/null +++ b/train.py @@ -0,0 +1,246 @@ +import os +import time +import json +import pprint +import random +import numpy as np +from tqdm import tqdm, trange +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from config.config import BaseOptions +from model.conquer import CONQUER +from data_loader.second_stage_start_end_dataset import StartEndDataset +from inference import eval_epoch +from optim.adamw import AdamW +from utils.basic_utils import TimeTracker, load_config, save_json, get_logger +from utils.model_utils import count_parameters, move_cuda, start_end_collate + + + +def set_seed(seed, use_cuda=True): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if use_cuda: + torch.cuda.manual_seed_all(seed) + + + +def rm_key_from_odict(odict_obj, rm_suffix): + """remove key entry from the OrderedDict""" + return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) + + +def build_optimizer(model, opts): + # Prepare 
optimizer + param_optimizer = [(n, p) for n, p in model.named_parameters() + if (n.startswith('encoder') or n.startswith('query_weight')) and p.requires_grad ] + + param_top = [(n, p) for n, p in model.named_parameters() + if ( not n.startswith('encoder') and not n.startswith('query_weight')) and p.requires_grad] + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_top + if not any(nd in n for nd in no_decay)], + 'weight_decay': opts.wd}, + {'params': [p for n, p in param_top + if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0}, + {'params': [p for n, p in param_optimizer + if not any(nd in n for nd in no_decay)], + 'lr': opts.lr_mul * opts.lr, + 'weight_decay': opts.wd}, + {'params': [p for n, p in param_optimizer + if any(nd in n for nd in no_decay)], + 'lr': opts.lr_mul * opts.lr, + 'weight_decay': 0.0} + ] + + # currently Adam only + optimizer = AdamW(optimizer_grouped_parameters, + lr=opts.lr) + return optimizer + + +def train(model, train_data, val_data, test_data, opt, logger): + # Prepare optimizer + if opt.device.type == "cuda": + model.to(opt.device) + logger.info("CUDA enabled.") + assert len(opt.device_ids) == 1 + + train_loader = DataLoader(train_data, + collate_fn=start_end_collate, + batch_size=opt.bsz, + num_workers=opt.num_workers, + shuffle=True, + pin_memory=True, + drop_last=True) + + # Prepare optimizer + optimizer = build_optimizer(model, opt) + thresholds = [0.3, 0.5, 0.7] + topks = [10, 20, 40] + best_val_ndcg = 0 + eval_step = len(train_loader) // opt.eval_num_per_epoch + + time_tracker = TimeTracker() + for epoch_i in range(0, opt.n_epoch): + print(f"TRAIN EPOCH: {epoch_i}|{opt.n_epoch}") + + num_training_examples = len(train_loader) + time_tracker.start("grab_data") + + for batch_idx, batch in tqdm(enumerate(train_loader), desc=f"Training {epoch_i}|{opt.n_epoch}", total=num_training_examples): + global_step = epoch_i * num_training_examples + batch_idx + time_tracker.stop("grab_data") + time_tracker.start("to_device") + model.train() + model_inputs = move_cuda(batch["model_inputs"], opt.device) + time_tracker.stop("to_device") + time_tracker.start("forward") + optimizer.zero_grad() + + loss, loss_dict = model(model_inputs) + time_tracker.stop("forward") + time_tracker.start("backward") + + loss.backward() + if opt.grad_clip != -1: + nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) + optimizer.step() + + time_tracker.stop("backward") + time_tracker.start("grab_data") + + if global_step % 10 == 0: + print(time_tracker.report()) + time_tracker.reset_all() + for i in range(torch.cuda.device_count()): + print(f"Memory Allocated on GPU {i}: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB") + print(f"Memory Cached on GPU {i}: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB") + print("-------------------------") + + ###### ------------------- ############# + ### eval during training + if global_step % eval_step == 0 and global_step != 0: + model.eval() + + val_performance, val_predictions = eval_epoch(model, val_data, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + test_performance, test_predictions = eval_epoch(model, test_data, opt, max_after_nms=40, iou_thds=thresholds, topks=topks) + + logger.info(f"EPOCH: {epoch_i}") + line1 = "" + line2 = "VAL: " + line3 = "TEST: " + for K, vs in val_performance.items(): + for T, v in vs.items(): + line1 += f"NDCG@{K}, IoU={T}\t" + line2 += f" {v:.6f}" + + for K, vs in test_performance.items(): + for T, v in vs.items(): + 
line3 += f" {v:.6f}" + logger.info(line1) + logger.info(line2) + logger.info(line3) + + anchor_ndcg = val_performance[20][0.5] + if anchor_ndcg > best_val_ndcg: + print("~"*40) + save_json(val_predictions, os.path.join(opt.results_dir, "best_val_predictions.json")) + save_json(test_predictions, os.path.join(opt.results_dir, "best_test_predictions.json")) + best_val_ndcg = anchor_ndcg + logger.info("BEST " + line2) + logger.info("BEST " + line3) + checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} + torch.save(checkpoint, opt.ckpt_filepath) + logger.info("save checkpoint: {}".format(opt.ckpt_filepath)) + print("~"*40) + + logger.info("") + + +def start_training(): + opt = BaseOptions().parse() + logger = get_logger(opt.results_dir, opt.model_name +"_"+ opt.exp_id) + set_seed(opt.seed) + opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" + opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" + + + data_config = load_config(opt.dataset_config) + + + train_dataset = StartEndDataset( + config=data_config, + data_path = data_config.train_data_path, + vr_rank_path = data_config.train_first_VR_ranklist_path, + mode="train", + data_ratio=opt.data_ratio, + neg_video_num=opt.neg_video_num, + use_extend_pool=opt.use_extend_pool, + ) + + val_dataset = StartEndDataset( + config = data_config, + data_path = data_config.val_data_path, + vr_rank_path = data_config.val_first_VR_ranklist_path_hero, + mode="val", + max_ctx_len=opt.max_ctx_len, + max_desc_len=opt.max_desc_len, + clip_length=opt.clip_length, + ctx_mode = opt.ctx_mode, + data_ratio = opt.data_ratio, + is_eval = True, + inference_top_k = opt.max_vcmr_video, + ) + + test_dataset = StartEndDataset( + config = data_config, + data_path = data_config.test_data_path, + vr_rank_path = data_config.test_first_VR_ranklist_path_hero, + mode="val", + max_ctx_len=opt.max_ctx_len, + max_desc_len=opt.max_desc_len, + clip_length=opt.clip_length, + ctx_mode = opt.ctx_mode, + data_ratio = opt.data_ratio, + is_eval = True, + inference_top_k = opt.max_vcmr_video, + ) + + + model_config = load_config(opt.model_config) + + logger.info("model_config {}".format(pprint.pformat(model_config,indent=4))) + + model = CONQUER( + model_config, + visual_dim = opt.visual_dim, + text_dim =opt.text_dim, + query_dim = opt.query_dim, + hidden_dim = opt.hidden_dim, + video_len= opt.max_ctx_len, + ctx_mode = opt.ctx_mode, + lw_video_ce = opt.lw_video_ce, # video cross-entropy loss weight + lw_st_ed = opt.lw_st_ed, # moment cross-entropy loss weight + similarity_measure=opt.similarity_measure, + use_debug = opt.debug, + no_output_moe_weight = opt.no_output_moe_weight) + + count_parameters(model) + + logger.info("Start Training...") + train(model, train_dataset, val_dataset, test_dataset, opt, logger) + + +if __name__ == '__main__': + start_training() + diff --git a/unused/convert_h5_lmdb.py b/unused/convert_h5_lmdb.py new file mode 100644 index 0000000000000000000000000000000000000000..56318222b518f945accdce7893357fa2e0050e73 --- /dev/null +++ b/unused/convert_h5_lmdb.py @@ -0,0 +1,27 @@ +import h5py +import lmdb +import numpy as np +from tqdm import tqdm + +h5_path = "data/h5/features/resnet_slowfast_1.5.h5" +lmdb_path = "data/features/resnet_slowfast_1.5" + +h5_data = h5py.File(h5_path, 'r') +env = lmdb.open(lmdb_path, readonly=False, create=True, max_dbs=0, map_size=1 * 1024**3) + +# Open or create the LMDB database +n = 0 +with env.begin(write=True) as txn: + # Iterate over 
items in the HDF5 file + for key in tqdm(h5_data.keys()): + print(key) + # Read the feature array for the current key + feature = h5_data[key][:] + buffer = np.getbuffer(feature) + txn.put(key.encode(), buffer) + + n += 1 + if n > 10: + break +print("Conversion completed.") + diff --git a/unused/convert_lmdb_h5.py b/unused/convert_lmdb_h5.py new file mode 100644 index 0000000000000000000000000000000000000000..2b5275c2a021962866b7a535f684e5090a49bf47 --- /dev/null +++ b/unused/convert_lmdb_h5.py @@ -0,0 +1,57 @@ +import h5py +import lmdb +import numpy as np +from tqdm import tqdm +import io +import msgpack_numpy +import msgpack + + +lmdb_path = "data/TVR_Ranking_val_top100_hero" +h5_path = "data/h5/TVR_Ranking_val_top100_hero.h5" +# Open the LMDB environment +env = lmdb.open(lmdb_path, readonly=True, max_dbs=0, max_readers=4096 * 8, readahead=False) + +h5_data = h5py.File(h5_path, 'w') +with env.begin(write=False, buffers=True) as txn: + cursor = txn.cursor() + keys = list(cursor.iternext(values=False)) # List of keys for progress tracking + for key in tqdm(keys, desc="Processing LMDB to HDF5"): + key_str = bytes(key).decode() + value = cursor.get(key) + _external_inference_vr_res = msgpack.loads(value) + h5_data.create_dataset(key_str, data=_external_inference_vr_res) +print("Conversion completed.") +h5_data.close() + +# lmdb_path = "data/features/resnet_slowfast_1.5" +# h5_path = "data/h5/features/resnet_slowfast_1.5.h5" +# env = lmdb.open(lmdb_path, readonly=True, max_dbs=0, max_readers=4096 * 8, readahead=False) +# h5_data = h5py.File(h5_path, 'w') +# with env.begin(write=False, buffers=True) as txn: +# cursor = txn.cursor() +# keys = list(cursor.iternext(values=False)) # List of keys for progress tracking +# for key in tqdm(keys, desc="Processing LMDB to HDF5"): +# key_str = bytes(key).decode() +# value = cursor.get(key) +# img_dump = {k: np.copy(v) for k, v in msgpack_numpy.loads(value, raw=False).items()} +# visual_feat = img_dump['features'] # Adjust if needed, like [:self.max_ctx_len] +# h5_data.create_dataset(key_str, data=visual_feat) +# print("Conversion completed.") +# h5_data.close() + + +# lmdb_path = "data/features/tvr_sub_pretrained_w_sub_query_max_cl-1.5" +# h5_path = "data/h5/features/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5" +# env = lmdb.open(lmdb_path, readonly=True, max_dbs=0, max_readers=4096 * 8, readahead=False) +# h5_data = h5py.File(h5_path, 'w') +# with env.begin(write=False, buffers=True) as txn: +# cursor = txn.cursor() +# for key, value in tqdm(cursor): +# key_str = bytes(key).decode() +# with io.BytesIO(value) as reader: +# feat_dump = np.load(reader, allow_pickle=True) +# sub_feat = feat_dump["features"] +# h5_data.create_dataset(key_str, data=sub_feat) +# print("Conversion completed.") +# h5_data.close() diff --git a/unused/find_best_epoch.py b/unused/find_best_epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..d3499a9c0431753c79d1815b947bcb46e444cbe0 --- /dev/null +++ b/unused/find_best_epoch.py @@ -0,0 +1,21 @@ +def rewrite_epoch(filename, new_file_name): + max_value = float(-100) + new_file = [] + + with open(filename, 'r') as file: + for line in file: + new_file.append(line) + if line.startswith("INFO:VAL"): + anchor = float(line.split()[5]) # Assuming the value is at the 5th index + if anchor > max_value: + max_value = anchor + print(max_value) + new_file.append("BEST: " + line) + + with open(new_file_name, 'w') as file: + file.writelines(new_file) + +# Example usage +filename = 
"results/tvr-top40-2024_07_11_10_58_46/20240711_105847_conquer_top40.log" +new_file_name = "new.log" +best_epoch = rewrite_epoch(filename, new_file_name) diff --git a/unused/run_exclusive.sh b/unused/run_exclusive.sh new file mode 100644 index 0000000000000000000000000000000000000000..94466530fe9818ead1069d8a38c1f0abaebb4be8 --- /dev/null +++ b/unused/run_exclusive.sh @@ -0,0 +1,13 @@ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --dataset_config config/tvr_data_config.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --exp_id debug \ + --max_vcmr_video 10 \ + --similarity_measure exclusive + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_exclusive.sh + diff --git a/unused/run_general.sh b/unused/run_general.sh new file mode 100644 index 0000000000000000000000000000000000000000..464fe5cca75e41afb1e171f4d637d2fa266fa38b --- /dev/null +++ b/unused/run_general.sh @@ -0,0 +1,15 @@ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --dataset_config config/tvrranking_data_config.json \ + --use_interal_vr_scores \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR VR \ + --bsz 64 \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --exp_id reproduce \ + --max_vcmr_video 2 \ + --similarity_measure general \ + --eval_num_per_epoch 1 + # qsub -I -l select=1:ngpus=1 -P gs_slab -q slab_gpu8 + # cd 11_TVR-Ranking/CONQUER/; conda activate py11; sh run_general.sh + diff --git a/unused/run_top01.sh b/unused/run_top01.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5ff950a11cb1d5d2eaddfa08d7f526b38e667c1 --- /dev/null +++ b/unused/run_top01.sh @@ -0,0 +1,27 @@ +# CUDA_VISIBLE_DEVICES=0 \ +python method_tvr/train.py \ + --model_name ReLoCLNet \ + --model_config_path ./configs/ReLoCLNet.yaml \ + --dset_name TVR-Ranking \ + --eval_split_name val \ + --nms_thd -1 \ + --results_root results/tvr_ranking \ + --train_path data/TVR_Ranking/train_top01.jsonl \ + --val_path data/TVR_Ranking/val.jsonl \ + --test_path data/TVR_Ranking/test.jsonl \ + --clip_length 1.5 \ + --vid_feat_size 1024 \ + --ctx_mode video_sub_tef \ + --no_norm_vfeat \ + --max_pred_l 16\ + --sub_feat_size 768\ + --video_duration_idx_path ./data/common_data/video_corpus.json \ + --desc_bert_path ./data/common_data/query_bert.h5 \ + --vid_feat_path /home/share/czzhang/Dataset/TVR/TVR_feature/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 \ + --sub_bert_path /home/share/czzhang/Dataset/TVR/TVR_feature/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5\ + --eval_tasks_at_training VCMR \ + --eval_num_per_epoch 0.1 \ + --n_epoch 1000 \ + --exp_id top01 + # qsub -I -l select=1:ngpus=1 -P gs_slab -q slab_gpu8 + # cd 11_TVR-Ranking/ReLoCLNet/; conda activate py11; sh run_top01.sh \ No newline at end of file diff --git a/unused/run_tvrranking.sh b/unused/run_tvrranking.sh new file mode 100644 index 0000000000000000000000000000000000000000..952ae3f7ff7d13d3b1d343d8871a673fc1ea15f3 --- /dev/null +++ b/unused/run_tvrranking.sh @@ -0,0 +1,15 @@ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --dataset_config config/tvrranking_data_config.json \ + --model_config config/model_config.json \ + --eval_tasks_at_training VCMR \ + --use_interal_vr_scores \ + --bsz 64 \ + --use_extend_pool 500 \ + --neg_video_num 0 \ + --exp_id debug \ + --bsz 5 \ + --max_vcmr_video 10 \ + --num_workers 4 + + # qsub -I -l select=1:ngpus=1 -P gs_slab -q gpu8 + # cd 
11_TVR-Ranking/CONQUER/; conda activate py11; sh run.sh \ No newline at end of file diff --git a/unused/select_conquer_dataset.py b/unused/select_conquer_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..45369cd87ea1e5d29d1f7af609976f1383de8099 --- /dev/null +++ b/unused/select_conquer_dataset.py @@ -0,0 +1,49 @@ +import h5py +import lmdb +import numpy as np +import msgpack +from utils.basic_utils import load_json, save_json +from tqdm import tqdm +import os + +data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/train_top40.json" +# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/val.json" +# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/test.json" +old_data = load_json(data_path) + +new_data_path = "./data/TVR_Ranking_CONQUER/train_top40.json" +# new_data_path = "./data/TVR_Ranking_CONQUER/val.json" +# new_data_path = "./data/TVR_Ranking_CONQUER/test.json" +new_vr_path = "data/TVR_Ranking_train_top100_hero" +# new_vr_path = "data/TVR_Ranking_val_top100_hero" +# new_vr_path = "data/TVR_Ranking_test_top100_hero" + +# Destination LMDB path (for writing) + +os.makedirs(new_vr_path, exist_ok=True) + +consolidated_path = "/home/renjie.liang/datasets/tvr_feature_release/data/consolidated_vr_results" +vr_pool = lmdb.open(consolidated_path, readonly=True, create=False, max_readers=4096 * 8, readahead=False) +vr_txn = vr_pool.begin(buffers=True) + +# Open the new LMDB for writing +new_vr_pool = lmdb.open(new_vr_path, readonly=False, create=True, max_dbs=0, map_size=10 * 1024**3) # 10 GiB +clean_data = [] +with new_vr_pool.begin(write=True) as new_vr_txn: + for i in tqdm(old_data): + query_id = i["query_id"] + # Retrieve the data from the source database + vr_data = vr_txn.get(str(query_id).encode()) + if vr_data is not None: + clean_data.append(i) + # Data exists, so load it using msgpack and then put it into the new database + vr_res = msgpack.loads(vr_data) + # Ensure the data is serialized before storing + vr_data_serialized = msgpack.dumps(vr_res) + new_vr_txn.put(str(query_id).encode(), vr_data_serialized) + +# Close both the source and destination databases after operations are complete +save_json(clean_data, new_data_path) +print(len(old_data), "->", len(clean_data)) +vr_pool.close() +new_vr_pool.close() \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/basic_utils.py b/utils/basic_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..68666749ee3253d8f14e1ecc8a2e8991d72e941d --- /dev/null +++ b/utils/basic_utils.py @@ -0,0 +1,242 @@ +import os +import json +import zipfile +import numpy as np +import pickle +from easydict import EasyDict + +def load_config(config_json_file) -> EasyDict: + with open(config_json_file, + "r", encoding='utf-8') as reader: + config = json.loads(reader.read()) + cfg = EasyDict(config) + + return cfg + + +def load_pickle(filename): + with open(filename, "rb") as f: + return pickle.load(f) + + +def save_pickle(data, filename): + with open(filename, "wb") as f: + pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) + + +def load_json(filename): + with open(filename, "r") as f: + return json.load(f) + + +def save_json(data, filename, save_pretty=False, sort_keys=False): + with open(filename, "w") as f: + if save_pretty: + f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) + 
else: + json.dump(data, f) + + +def load_jsonl(filename): + with open(filename, "r") as f: + return [json.loads(l.strip("\n")) for l in f.readlines()] + + +def save_jsonl(data, filename): + """data is a list""" + with open(filename, "w") as f: + f.write("\n".join([json.dumps(e) for e in data])) + + +def save_lines(list_of_str, filepath): + with open(filepath, "w") as f: + f.write("\n".join(list_of_str)) + + +def read_lines(filepath): + with open(filepath, "r") as f: + return [e.strip("\n") for e in f.readlines()] + + +def mkdirp(p): + if not os.path.exists(p): + os.makedirs(p) + + +def flat_list_of_lists(l): + """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]""" + return [item for sublist in l for item in sublist] + + +def convert_to_seconds(hms_time): + """ convert '00:01:12' to 72 seconds. + :hms_time (str): time in comma separated string, e.g. '00:01:12' + :return (int): time in seconds, e.g. 72 + """ + times = [float(t) for t in hms_time.split(":")] + return times[0] * 3600 + times[1] * 60 + times[2] + + +def get_video_name_from_url(url): + return url.split("/")[-1][:-4] + + +def merge_dicts(list_dicts): + merged_dict = list_dicts[0].copy() + for i in range(1, len(list_dicts)): + merged_dict.update(list_dicts[i]) + return merged_dict + + +def l2_normalize_np_array(np_array, eps=1e-5): + """np_array: np.ndarray, (*, D), where the last dim will be normalized""" + return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps) + + +def make_zipfile(src_dir, save_path, enclosing_dir="", exclude_dirs=None, exclude_extensions=None, + exclude_dirs_substring=None): + """make a zip file of root_dir, save it to save_path. + exclude_paths will be excluded if it is a subdir of root_dir. + An enclosing_dir is added is specified. + """ + abs_src = os.path.abspath(src_dir) + with zipfile.ZipFile(save_path, "w") as zf: + for dirname, subdirs, files in os.walk(src_dir): + if exclude_dirs is not None: + for e_p in exclude_dirs: + if e_p in subdirs: + subdirs.remove(e_p) + if exclude_dirs_substring is not None: + to_rm = [] + for d in subdirs: + if exclude_dirs_substring in d: + to_rm.append(d) + for e in to_rm: + subdirs.remove(e) + arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:]) + zf.write(dirname, arcname) + for filename in files: + if exclude_extensions is not None: + if os.path.splitext(filename)[1] in exclude_extensions: + continue # do not zip it + absname = os.path.join(dirname, filename) + arcname = os.path.join(enclosing_dir, absname[len(abs_src) + 1:]) + zf.write(absname, arcname) + + +class AverageMeter(object): + """Computes and stores the average and current/max/min value""" + def __init__(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = -1e10 + self.min = 1e10 + + def update(self, val, n=1): + self.max = max(val, self.max) + self.min = min(val, self.min) + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True): + """Dissect an array (N, D) into a list a sub-array, + np_array.shape[0] == sum(lengths), Output is a list of nd arrays, singlton dimention is kept""" + if assert_equal: + assert len(np_array) == sum(lengths) + length_indices = [0, ] + for i in range(len(lengths)): + length_indices.append(length_indices[i] + lengths[i]) + if dim == 0: + array_list = 
[np_array[length_indices[i]:length_indices[i+1]] for i in range(len(lengths))] + elif dim == 1: + array_list = [np_array[:, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + elif dim == 2: + array_list = [np_array[:, :, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] + else: + raise NotImplementedError + return array_list + + +def get_ratio_from_counter(counter_obj, threshold=200): + keys = counter_obj.keys() + values = counter_obj.values() + filtered_values = [counter_obj[k] for k in keys if k > threshold] + return float(sum(filtered_values)) / sum(values) + + + + +import time +import logging +import os + +def get_logger(dir, tile): + os.makedirs(dir, exist_ok=True) + log_file = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + log_file = os.path.join(dir, "{}_{}.log".format(log_file, tile)) + + logger = logging.getLogger() + logger.setLevel('DEBUG') + BASIC_FORMAT = "%(levelname)s:%(message)s" + # DATE_FORMAT = '%Y-%m-%d %H:%M:%S' + formatter = logging.Formatter(BASIC_FORMAT) + chlr = logging.StreamHandler() + chlr.setFormatter(formatter) + + fhlr = logging.FileHandler(log_file) + fhlr.setFormatter(formatter) + fhlr.setLevel('INFO') + + logger.addHandler(chlr) + logger.addHandler(fhlr) + return logger + + + + + + + +class TimeTracker: + def __init__(self): + self.times = {} + self.start_times = {} + + def start(self, name): + self.start_times[name] = time.time() + + def stop(self, name): + if name not in self.times: + self.times[name] = 0 + if name in self.start_times: + self.times[name] += time.time() - self.start_times[name] + del self.start_times[name] + + def get_time(self, name): + return self.times.get(name, 0) + + def reset(self, name): + if name in self.times: + self.times[name] = 0 + + def reset_all(self): + self.times = {} + self.start_times = {} + + def report(self): + report = "\n".join([f"{name}: {time:.4f} seconds" for name, time in self.times.items()]) + return report diff --git a/utils/inference_utils.py b/utils/inference_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2b80d8f1ff3bf9148ec30f037ff1c416704dc822 --- /dev/null +++ b/utils/inference_utils.py @@ -0,0 +1,76 @@ +from utils.temporal_nms import temporal_non_maximum_suppression +from collections import defaultdict + + +def get_submission_top_n(submission, top_n=100): + def get_prediction_top_n(list_dict_predictions, top_n): + top_n_res = [] + for e in list_dict_predictions: + e["predictions"] = e["predictions"][:top_n] + top_n_res.append(e) + return top_n_res + + top_n_submission = dict(video2idx=submission["video2idx"], ) + for k in submission: + if k != "video2idx": + top_n_submission[k] = get_prediction_top_n(submission[k], top_n) + return top_n_submission + + + +def post_processing_vcmr_nms(vcmr_res, nms_thd=0.6, max_before_nms=1000, max_after_nms=100): + """ + vcmr_res: list(dict), each dict is{ + "desc": str, + "query_id": int, + "predictions": list(sublist) # each sublist is + [video_idx (int), st (float), ed(float), score (float)], video_idx could be different + } + """ + processed_vcmr_res = [] + for e in vcmr_res: + e["predictions"] = filter_vcmr_by_nms(e["predictions"], + nms_threshold=nms_thd, + max_before_nms=max_before_nms, + max_after_nms=max_after_nms) + processed_vcmr_res.append(e) + return processed_vcmr_res + + +def filter_vcmr_by_nms(all_video_predictions, nms_threshold=0.6, + max_before_nms=1000, max_after_nms=100, score_col_idx=3): + """ Apply non-maximum suppression for all the predictions for each video. 
+ 1) group predictions by video index + 2) apply nms individually for each video index group + 3) combine and sort the predictions + Args: + all_video_predictions: list(sublist), + Each sublist is [video_idx (int), st (float), ed(float), score (float)] + Note the scores are negative distances. + nms_threshold: float + max_before_nms: int + max_after_nms: int + score_col_idx: int + Returns: + + """ + predictions_neg_by_video_group = defaultdict(list) + for pred in all_video_predictions[:max_before_nms]: + predictions_neg_by_video_group[pred[0]].append(pred[1:]) # [st (float), ed(float), score (float)] + + predictions_by_video_group_neg_after_nms = dict() + for video_idx, grouped_preds in predictions_neg_by_video_group.items(): + predictions_by_video_group_neg_after_nms[video_idx] = \ + temporal_non_maximum_suppression(grouped_preds, nms_threshold=nms_threshold) + + predictions_after_nms = [] + for video_idx, grouped_preds in predictions_by_video_group_neg_after_nms.items(): + for pred in grouped_preds: + pred = [video_idx] + pred # [video_idx (int), st (float), ed(float), score (float)] + predictions_after_nms.append(pred) + + # ranking happens across videos + predictions_after_nms = sorted(predictions_after_nms, + key=lambda x: x[score_col_idx], + reverse=True)[:max_after_nms] # descending order + return predictions_after_nms \ No newline at end of file diff --git a/utils/mk_video_split_with_duration.py b/utils/mk_video_split_with_duration.py new file mode 100644 index 0000000000000000000000000000000000000000..ab5a524174febeb4515e511dc33c10a74c212d84 --- /dev/null +++ b/utils/mk_video_split_with_duration.py @@ -0,0 +1,18 @@ +from utils.basic_utils import load_json, save_json + + +def combine(video_name_split_path, video_duration_path, save_path): + video_name_split = load_json(video_name_split_path) + video_duration_dict = load_json(video_duration_path) + + combined_dict = {} + for split_name, split_video_names in video_name_split.items(): + combined_dict[split_name] = {vid_name: video_duration_dict[vid_name] + for vid_name in split_video_names} + save_json(combined_dict, save_path) + + +if __name__ == '__main__': + import sys + combine(*sys.argv[1:]) + diff --git a/utils/model_utils.py b/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d2dead4ace10a865bd310eee3b1cbe953d3b3ddd --- /dev/null +++ b/utils/model_utils.py @@ -0,0 +1,68 @@ +__author__ = "Jie Lei" + +# ref: https://github.com/lichengunc/MAttNet/blob/master/lib/layers/lang_encoder.py#L11 +# ref: https://github.com/easonnie/flint/blob/master/torch_util.py#L272 +import torch +from torch.utils.data.dataloader import default_collate + +VERY_NEGATIVE_NUMBER = -1e10 +VERY_POSITIVE_NUMBER = 1e10 + +def count_parameters(model, verbose=True): + """Count number of parameters in PyTorch model, + References: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7. 
+ + from utils.utils import count_parameters + count_parameters(model) + import sys + sys.exit(1) + """ + n_all = sum(p.numel() for p in model.parameters()) + n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + if verbose: + print("Parameter Count: all {:,d}; trainable {:,d}".format(n_all, n_trainable)) + return n_all, n_trainable + +def mask_logits(target, mask): + return target * mask + (1 - mask) * VERY_NEGATIVE_NUMBER + +def move_cuda(batch,device): + # move to cuda + for key, value in batch.items(): + if isinstance(value, dict): + for _key, _value in value.items(): + batch[key][_key] = _value.cuda(non_blocking=True, device=device) + elif isinstance(value, (list,)): + for i in range(len(value)): + batch[key][i] = value[i].cuda(non_blocking=True, device=device) + else: + batch[key] = value.cuda(non_blocking=True, device=device) + + return batch + +def start_end_collate(batch): + batch_meta = [e["meta"] for e in batch] # no need to collate + + batched_data = default_collate([e["model_inputs"] for e in batch]) + return {"meta":batch_meta, "model_inputs":batched_data} + + +# def vsmr_start_end_collate(batch): +# batch_meta = [e["meta"] for e in batch] # no need to collate +# +# batched_data = dict() +# sample_batch_data = batch[0]["model_inputs"] +# +# for key in ["visual", "sub"]: +# if key in sample_batch_data.keys(): +# batched_data[key] = dict() +# for key_2 in ["feat","feat_mask","feat_pos_id","feat_token_id"]: +# batched_data[key][key_2] = torch.cat(tuple(e["model_inputs"][key][key_2] for e in batch),dim=0) +# +# +# for key in ["query", "st_ed_indices" ]: +# if key in sample_batch_data.keys(): +# batched_data[key] = default_collate([e["model_inputs"][key] for e in batch]) +# +# return {"meta":batch_meta, "model_inputs":batched_data} + diff --git a/utils/temporal_nms.py b/utils/temporal_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..545ed8045d7da4a6a831395029e39c0f803025d5 --- /dev/null +++ b/utils/temporal_nms.py @@ -0,0 +1,74 @@ +""" +Non-Maximum Suppression for video proposals. +""" + + +def compute_temporal_iou(pred, gt): + """ deprecated due to performance concerns + compute intersection-over-union along temporal axis + Args: + pred: [st (float), ed (float)] + gt: [st (float), ed (float)] + Returns: + iou (float): + + Ref: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + """ + intersection = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0])) + union = max(pred[1], gt[1]) - min(pred[0], gt[0]) # not the correct union though + if union == 0: + return 0 + else: + return 1.0 * intersection / union + + +def temporal_non_maximum_suppression(predictions, nms_threshold, max_after_nms=100): + """ + Args: + predictions: list(sublist), each sublist is [st (float), ed(float), score (float)], + note larger scores are better and are preserved. For metrics that are better when smaller, + please convert to its negative, e.g., convert distance to negative distance. 
+ nms_threshold: float in [0, 1] + max_after_nms: + Returns: + predictions_after_nms: list(sublist), each sublist is [st (float), ed(float), score (float)] + References: + https://github.com/wzmsltw/BSN-boundary-sensitive-network/blob/7b101fc5978802aa3c95ba5779eb54151c6173c6/Post_processing.py#L42 + """ + if len(predictions) == 1: # only has one prediction, no need for nms + return predictions + + predictions = sorted(predictions, key=lambda x: x[2], reverse=True) # descending order + + tstart = [e[0] for e in predictions] + tend = [e[1] for e in predictions] + tscore = [e[2] for e in predictions] + rstart = [] + rend = [] + rscore = [] + while len(tstart) > 1 and len(rscore) < max_after_nms: # max 100 after nms + idx = 1 + while idx < len(tstart): # compare with every prediction in the list. + if compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]) > nms_threshold: + # rm highly overlapped lower score entries. + tstart.pop(idx) + tend.pop(idx) + tscore.pop(idx) + # print("--------------------------------") + # print(compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]])) + # print([tstart[0], tend[0]], [tstart[idx], tend[idx]]) + # print(tstart.pop(idx), tend.pop(idx), tscore.pop(idx)) + else: + # move to next + idx += 1 + rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + if len(rscore) < max_after_nms and len(tstart) >= 1: # add the last, possibly empty. + rstart.append(tstart.pop(0)) + rend.append(tend.pop(0)) + rscore.append(tscore.pop(0)) + + predictions_after_nms = [[st, ed, s] for s, st, ed in zip(rscore, rstart, rend)] + return predictions_after_nms diff --git a/utils/tensor_utils.py b/utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..72497127fdbbd935bfc8c42b5fae723db04d73f8 --- /dev/null +++ b/utils/tensor_utils.py @@ -0,0 +1,141 @@ +import numpy as np +import torch + + +def pad_sequences_1d(sequences, dtype=torch.long, device=torch.device("cpu"), fixed_length=None): + """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) + into a (n+1)-d array, only allow the first dim has variable lengths. + Args: + sequences: list(n-d tensor or list) + dtype: np.dtype or torch.dtype + device: + fixed_length: pad all seq in sequences to fixed length. All seq should have a length <= fixed_length. + return will be of shape [len(sequences), fixed_length, ...] 
+ Returns: + padded_seqs: ((n+1)-d tensor) padded with zeros + mask: (2d tensor) of the same shape as the first two dims of padded_seqs, + 1 indicate valid, 0 otherwise + Examples: + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=torch.long) + >>> test_data_3d = [torch.randn(2,3,4), torch.randn(4,3,4), torch.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=torch.float) + >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] + >>> pad_sequences_1d(test_data_list, dtype=np.float32) + >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] + >>> pad_sequences_1d(test_data_3d, dtype=np.float32) + """ + if isinstance(sequences[0], list): + if "torch" in str(dtype): + sequences = [torch.tensor(s, dtype=dtype, device=device) for s in sequences] + else: + sequences = [np.asarray(s, dtype=dtype) for s in sequences] + + extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements + lengths = [len(seq) for seq in sequences] + if fixed_length is not None: + max_length = fixed_length + else: + max_length = max(lengths) + if isinstance(sequences[0], torch.Tensor): + assert "torch" in str(dtype), "dtype and input type does not match" + padded_seqs = torch.zeros((len(sequences), max_length) + extra_dims, dtype=dtype, device=device) + mask = torch.zeros((len(sequences), max_length), dtype=torch.float32, device=device) + else: # np + assert "numpy" in str(dtype), "dtype and input type does not match" + padded_seqs = np.zeros((len(sequences), max_length) + extra_dims, dtype=dtype) + mask = np.zeros((len(sequences), max_length), dtype=np.float32) + + for idx, seq in enumerate(sequences): + end = lengths[idx] + padded_seqs[idx, :end] = seq + mask[idx, :end] = 1 + return padded_seqs, mask # , lengths + + +def pad_sequences_2d(sequences, dtype=torch.long): + """ Pad a double-nested list or a sequence of n-d torch tensor into a (n+1)-d tensor, + only allow the first two dims has variable lengths + Args: + sequences: list(n-d tensor or list) + dtype: torch.long for word indices / torch.float (float32) for other cases + Returns: + Examples: + >>> test_data_list = [[[1, 3, 5], [3, 7, 4, 1]], [[98, 34, 11, 89, 90], [22], [34, 56]],] + >>> pad_sequences_2d(test_data_list, dtype=torch.long) # torch.Size([2, 3, 5]) + >>> test_data_3d = [torch.randn(2,2,4), torch.randn(4,3,4), torch.randn(1,5,4)] + >>> pad_sequences_2d(test_data_3d, dtype=torch.float) # torch.Size([2, 3, 5]) + >>> test_data_3d2 = [[torch.randn(2,4), ], [torch.randn(3,4), torch.randn(5,4)]] + >>> pad_sequences_2d(test_data_3d2, dtype=torch.float) # torch.Size([2, 3, 5]) + # TODO add support for numpy array + """ + bsz = len(sequences) + para_lengths = [len(seq) for seq in sequences] + max_para_len = max(para_lengths) + sen_lengths = [[len(word_seq) for word_seq in seq] for seq in sequences] + max_sen_len = max([max(e) for e in sen_lengths]) + + if isinstance(sequences[0], torch.Tensor): + extra_dims = sequences[0].shape[2:] + elif isinstance(sequences[0][0], torch.Tensor): + extra_dims = sequences[0][0].shape[1:] + else: + sequences = [[torch.Tensor(word_seq, dtype=dtype) for word_seq in seq] for seq in sequences] + extra_dims = () + + padded_seqs = torch.zeros((bsz, max_para_len, max_sen_len) + extra_dims, dtype=dtype) + mask = torch.zeros(bsz, max_para_len, max_sen_len).float() + + for b_i in range(bsz): + for sen_i, sen_l in enumerate(sen_lengths[b_i]): + padded_seqs[b_i, sen_i, :sen_l] = sequences[b_i][sen_i] + mask[b_i, sen_i, :sen_l] = 1 
+    return padded_seqs, mask  # , sen_lengths
+
+
+def find_max_triples(st_prob, ed_prob, top_n=5, prob_thd=None, tensor_type="torch"):
+    """ Find a list of (k1, k2) where k1 < k2 with the maximum values of st_prob[k1] * ed_prob[k2]
+    Args:
+        st_prob (torch.Tensor or np.ndarray): (N, L) batched start_idx probabilities
+        ed_prob (torch.Tensor or np.ndarray): (N, L) batched end_idx probabilities
+        top_n (int): return topN pairs with highest values
+        prob_thd (float or None): drop pairs whose confidence (st_prob * ed_prob) is below this value
+        tensor_type: str, np or torch
+    Returns:
+        batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...]
+    """
+    if tensor_type == "torch":
+        st_prob, ed_prob = st_prob.data.numpy(), ed_prob.data.numpy()
+    product = np.einsum("bm,bn->bmn", st_prob, ed_prob)
+    # (N, L, L) the lower part becomes zeros, start_idx < ed_idx
+    upper_product = np.triu(product, k=1)
+    return find_max_triples_from_upper_triangle_product(upper_product, top_n=top_n, prob_thd=prob_thd)
+
+
+def find_max_triples_from_upper_triangle_product(upper_product, top_n=5, prob_thd=None):
+    """ Find a list of (k1, k2) where k1 < k2 with the maximum values of p1[k1] * p2[k2]
+    Args:
+        upper_product (torch.Tensor or np.ndarray): (N, L, L), the lower part becomes zeros, end_idx > start_idx
+        top_n (int): return topN pairs with highest values
+        prob_thd (float or None): drop pairs whose confidence is below this value
+    Returns:
+        batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...]
+    """
+    batched_sorted_triple = []
+    for idx, e in enumerate(upper_product):
+        sorted_triple = top_n_array_2d(e, top_n=top_n)
+        if prob_thd is not None:
+            # keep only the rows whose confidence (3rd column) passes the threshold
+            sorted_triple = sorted_triple[sorted_triple[:, 2] >= prob_thd]
+        batched_sorted_triple.append(sorted_triple)
+    return batched_sorted_triple
+
+
+def top_n_array_2d(array_2d, top_n):
+    """ Get topN indices and values of a 2d array,
+    return a (top_n, 3) array of [row_idx, col_idx, value], ranked by value in descending order.
+    """
+    row_indices, column_indices = np.unravel_index(np.argsort(array_2d, axis=None), array_2d.shape)
+    row_indices = row_indices[::-1][:top_n]
+    column_indices = column_indices[::-1][:top_n]
+    sorted_values = array_2d[row_indices, column_indices]
+    return np.stack([row_indices, column_indices, sorted_values], axis=1)  # (N, 3)
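+
+
+if __name__ == "__main__":
+    # Minimal usage sketch for find_max_triples: the batch size and sequence length here are
+    # arbitrary, and this block only illustrates the expected output format (it is not called
+    # anywhere in the training or evaluation pipeline and is safe to remove).
+    example_st = torch.softmax(torch.randn(2, 6), dim=1)  # (N=2, L=6) start-index probabilities
+    example_ed = torch.softmax(torch.randn(2, 6), dim=1)  # (N=2, L=6) end-index probabilities
+    for triples in find_max_triples(example_st, example_ed, top_n=3):
+        # each row is [st_idx, ed_idx, st_prob * ed_prob] with st_idx < ed_idx,
+        # sorted by confidence in descending order; shape (top_n, 3)
+        print(triples)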