zhong-al committed
Commit 2c26ac8 · 1 Parent(s): aa912e2

Add model + config files

Files changed (13)
  1. .gitattributes +1 -0
  2. __init__.py +0 -0
  3. cfg.py +13 -0
  4. checkpoint_epoch_00075.pyth +3 -0
  5. config.yml +492 -0
  6. configuration_x3d.py +9 -0
  7. helpers/cfg.py +1286 -0
  8. helpers/head.py +146 -0
  9. helpers/norm.py +110 -0
  10. helpers/resnet.py +927 -0
  11. helpers/stem.py +320 -0
  12. modeling_x3d.py +15 -0
  13. x3d.py +350 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint_epoch_00075.pyth filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
File without changes
cfg.py ADDED
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from x3d_model.helpers.cfg import get_cfg
+
+def load_config(path_to_config=None):
+    # Set up the default cfg.
+    cfg = get_cfg()
+
+    # Merge overrides from the given config file, if any.
+    if path_to_config is not None:
+        cfg.merge_from_file(path_to_config)
+    return cfg
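A minimal usage sketch for load_config (assuming the package is importable as x3d_model; with no path it returns the defaults defined in helpers/cfg.py below):

from x3d_model.cfg import load_config

cfg = load_config()                 # library defaults only
cfg.merge_from_file("config.yml")   # optionally layer this repo's overrides on top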
checkpoint_epoch_00075.pyth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66ea6f31835ec44a91c7df23e304f429872c091b34a5447cd62ad7f1d1b3837e
+size 43662374
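A sketch of reading this checkpoint, assuming the .pyth file is an ordinary torch.save dict with a "model_state" key, as PySlowFast checkpoints typically are:

import torch

ckpt = torch.load("checkpoint_epoch_00075.pyth", map_location="cpu")
state_dict = ckpt["model_state"]      # assumption: PySlowFast-style checkpoint layout
# model.load_state_dict(state_dict)  # once an X3D model instance has been built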
config.yml ADDED
@@ -0,0 +1,492 @@
+AUG:
+  AA_TYPE: rand-m9-mstd0.5-inc1
+  COLOR_JITTER: 0.4
+  ENABLE: false
+  GEN_MASK_LOADER: false
+  INTERPOLATION: bicubic
+  MASK_FRAMES: false
+  MASK_RATIO: 0.0
+  MASK_TUBE: false
+  MASK_WINDOW_SIZE:
+  - 8
+  - 7
+  - 7
+  MAX_MASK_PATCHES_PER_BLOCK: null
+  NUM_SAMPLE: 1
+  RE_COUNT: 1
+  RE_MODE: pixel
+  RE_PROB: 0.25
+  RE_SPLIT: false
+AVA:
+  ANNOTATION_DIR: /mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/
+  BGR: false
+  DETECTION_SCORE_THRESH: 0.9
+  EXCLUSION_FILE: ava_val_excluded_timestamps_v2.2.csv
+  FRAME_DIR: /mnt/fair-flash3-east/ava_trainval_frames.img/
+  FRAME_LIST_DIR: /mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/
+  FULL_TEST_ON_VAL: false
+  GROUNDTRUTH_FILE: ava_val_v2.2.csv
+  IMG_PROC_BACKEND: cv2
+  LABEL_MAP_FILE: ava_action_list_v2.2_for_activitynet_2019.pbtxt
+  TEST_FORCE_FLIP: false
+  TEST_LISTS:
+  - val.csv
+  TEST_PREDICT_BOX_LISTS:
+  - ava_val_predicted_boxes.csv
+  TRAIN_GT_BOX_LISTS:
+  - ava_train_v2.2.csv
+  TRAIN_LISTS:
+  - train.csv
+  TRAIN_PCA_JITTER_ONLY: true
+  TRAIN_PREDICT_BOX_LISTS: []
+  TRAIN_USE_COLOR_AUGMENTATION: false
+BENCHMARK:
+  LOG_PERIOD: 100
+  NUM_EPOCHS: 5
+  SHUFFLE: true
+BN:
+  GLOBAL_SYNC: false
+  NORM_TYPE: sync_batchnorm
+  NUM_BATCHES_PRECISE: 200
+  NUM_SPLITS: 1
+  NUM_SYNC_DEVICES: 1
+  USE_PRECISE_STATS: true
+  WEIGHT_DECAY: 0.0
+CONTRASTIVE:
+  BN_MLP: false
+  BN_SYNC_MLP: false
+  DELTA_CLIPS_MAX: .inf
+  DELTA_CLIPS_MIN: -.inf
+  DIM: 128
+  INTERP_MEMORY: false
+  KNN_ON: true
+  LENGTH: 239975
+  LOCAL_SHUFFLE_BN: true
+  MEM_TYPE: 1d
+  MLP_DIM: 2048
+  MOCO_MULTI_VIEW_QUEUE: false
+  MOMENTUM: 0.5
+  MOMENTUM_ANNEALING: false
+  NUM_CLASSES_DOWNSTREAM: 400
+  NUM_MLP_LAYERS: 1
+  PREDICTOR_DEPTHS: []
+  QUEUE_LEN: 65536
+  SEQUENTIAL: false
+  SIMCLR_DIST_ON: true
+  SWAV_QEUE_LEN: 0
+  T: 0.07
+  TYPE: mem
+DATA:
+  COLOR_RND_GRAYSCALE: 0.0
+  DECODING_BACKEND: torchvision
+  DECODING_SHORT_SIZE: 256
+  DUMMY_LOAD: false
+  ENSEMBLE_METHOD: max
+  IN22K_TRAINVAL: false
+  IN22k_VAL_IN1K: ''
+  INPUT_CHANNEL_NUM:
+  - 3
+  INV_UNIFORM_SAMPLE: true
+  IN_VAL_CROP_RATIO: 0.875
+  LOADER_CHUNK_OVERALL_SIZE: 0
+  LOADER_CHUNK_SIZE: 0
+  MEAN:
+  - 0.45
+  - 0.45
+  - 0.45
+  MULTI_LABEL: true
+  NUM_FRAMES: 16
+  PATH_LABEL_SEPARATOR: ' '
+  PATH_PREFIX: kabr/KABR/dataset/image
+  PATH_TO_DATA_DIR: kabr/KABR/annotation
+  PATH_TO_PRELOAD_IMDB: ''
+  RANDOM_FLIP: true
+  REVERSE_INPUT_CHANNEL: true
+  SAMPLING_RATE: 5
+  SKIP_ROWS: 0
+  SSL_BLUR_SIGMA_MAX:
+  - 0.0
+  - 2.0
+  SSL_BLUR_SIGMA_MIN:
+  - 0.0
+  - 0.1
+  SSL_COLOR_BRI_CON_SAT:
+  - 0.2
+  - 0.2
+  - 0.2
+  SSL_COLOR_HUE: 0.1
+  SSL_COLOR_JITTER: true
+  SSL_MOCOV2_AUG: false
+  STD:
+  - 0.225
+  - 0.225
+  - 0.225
+  TARGET_FPS: 30
+  TEST_CROP_SIZE: 300
+  TIME_DIFF_PROB: 0.0
+  TRAIN_CROP_NUM_SPATIAL: 1
+  TRAIN_CROP_NUM_TEMPORAL: 1
+  TRAIN_CROP_SIZE: 300
+  TRAIN_JITTER_ASPECT_RELATIVE: []
+  TRAIN_JITTER_FPS: 0.0
+  TRAIN_JITTER_MOTION_SHIFT: false
+  TRAIN_JITTER_SCALES:
+  - 300
+  - 400
+  TRAIN_JITTER_SCALES_RELATIVE: []
+  TRAIN_PCA_EIGVAL:
+  - 0.225
+  - 0.224
+  - 0.229
+  TRAIN_PCA_EIGVEC:
+  - - -0.5675
+    - 0.7192
+    - 0.4009
+  - - -0.5808
+    - -0.0045
+    - -0.814
+  - - -0.5836
+    - -0.6948
+    - 0.4203
+  USE_OFFSET_SAMPLING: false
+DATA_LOADER:
+  ENABLE_MULTI_THREAD_DECODE: false
+  NUM_WORKERS: 8
+  PIN_MEMORY: true
+DEMO:
+  BUFFER_SIZE: 0
+  CLIP_VIS_SIZE: 10
+  COMMON_CLASS_NAMES:
+  - watch (a person)
+  - talk to (e.g., self, a person, a group)
+  - listen to (a person)
+  - touch (an object)
+  - carry/hold (an object)
+  - walk
+  - sit
+  - lie/sleep
+  - bend/bow (at the waist)
+  COMMON_CLASS_THRES: 0.7
+  DETECTRON2_CFG: COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
+  DETECTRON2_THRESH: 0.9
+  DETECTRON2_WEIGHTS: detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl
+  DISPLAY_HEIGHT: 0
+  DISPLAY_WIDTH: 0
+  ENABLE: false
+  FPS: 30
+  GT_BOXES: ''
+  INPUT_FORMAT: BGR
+  INPUT_VIDEO: kabr/KABR/dataset/video/G0103.mp4
+  LABEL_FILE_PATH: kabr/KABR/annotation/classes.json
+  NUM_CLIPS_SKIP: 1
+  NUM_VIS_INSTANCES: 1
+  OUTPUT_FILE: kabr/KABR/dataset/predict/G0103.mp4
+  OUTPUT_FPS: -1
+  PREDS_BOXES: ''
+  SLOWMO: 1
+  STARTING_SECOND: 900
+  THREAD_ENABLE: false
+  UNCOMMON_CLASS_THRES: 0.3
+  VIS_MODE: thres
+  WEBCAM: -1
+DETECTION:
+  ALIGNED: true
+  ENABLE: false
+  ROI_XFORM_RESOLUTION: 7
+  SPATIAL_SCALE_FACTOR: 16
+DIST_BACKEND: nccl
+LOG_MODEL_INFO: true
+LOG_PERIOD: 10
+MASK:
+  DECODER_DEPTH: 0
+  DECODER_EMBED_DIM: 512
+  DECODER_SEP_POS_EMBED: false
+  DEC_KV_KERNEL: []
+  DEC_KV_STRIDE: []
+  ENABLE: false
+  HEAD_TYPE: separate
+  MAE_ON: false
+  MAE_RND_MASK: false
+  NORM_PRED_PIXEL: true
+  PER_FRAME_MASKING: false
+  PRED_HOG: false
+  PRETRAIN_DEPTH:
+  - 15
+  SCALE_INIT_BY_DEPTH: false
+  TIME_STRIDE_LOSS: true
+MIXUP:
+  ALPHA: 0.8
+  CUTMIX_ALPHA: 1.0
+  ENABLE: false
+  LABEL_SMOOTH_VALUE: 0.1
+  PROB: 1.0
+  SWITCH_PROB: 0.5
+MODEL:
+  ACT_CHECKPOINT: false
+  ARCH: x3d
+  DETACH_FINAL_FC: false
+  DROPCONNECT_RATE: 0.0
+  DROPOUT_RATE: 0.5
+  FC_INIT_STD: 0.01
+  FP16_ALLREDUCE: false
+  FROZEN_BN: false
+  HEAD_ACT: sigmoid
+  LOSS_FUNC: EQL
+  MODEL_NAME: X3D
+  MULTI_PATHWAY_ARCH:
+  - slowfast
+  NUM_CLASSES: 8
+  SINGLE_PATHWAY_ARCH:
+  - 2d
+  - c2d
+  - i3d
+  - slow
+  - x3d
+  - mvit
+  - maskmvit
+MULTIGRID:
+  BN_BASE_SIZE: 8
+  DEFAULT_B: 0
+  DEFAULT_S: 0
+  DEFAULT_T: 0
+  EPOCH_FACTOR: 1.5
+  EVAL_FREQ: 3
+  LONG_CYCLE: false
+  LONG_CYCLE_FACTORS:
+  - - 0.25
+    - 0.7071067811865476
+  - - 0.5
+    - 0.7071067811865476
+  - - 0.5
+    - 1
+  - - 1
+    - 1
+  LONG_CYCLE_SAMPLING_RATE: 0
+  SHORT_CYCLE: false
+  SHORT_CYCLE_FACTORS:
+  - 0.5
+  - 0.7071067811865476
+MVIT:
+  CLS_EMBED_ON: true
+  DEPTH: 16
+  DIM_MUL: []
+  DIM_MUL_IN_ATT: false
+  DROPOUT_RATE: 0.0
+  DROPPATH_RATE: 0.1
+  EMBED_DIM: 96
+  HEAD_INIT_SCALE: 1.0
+  HEAD_MUL: []
+  LAYER_SCALE_INIT_VALUE: 0.0
+  MLP_RATIO: 4.0
+  MODE: conv
+  NORM: layernorm
+  NORM_STEM: false
+  NUM_HEADS: 1
+  PATCH_2D: false
+  PATCH_KERNEL:
+  - 3
+  - 7
+  - 7
+  PATCH_PADDING:
+  - 2
+  - 4
+  - 4
+  PATCH_STRIDE:
+  - 2
+  - 4
+  - 4
+  POOL_FIRST: false
+  POOL_KVQ_KERNEL: null
+  POOL_KV_STRIDE: []
+  POOL_KV_STRIDE_ADAPTIVE: null
+  POOL_Q_STRIDE: []
+  QKV_BIAS: true
+  REL_POS_SPATIAL: false
+  REL_POS_TEMPORAL: false
+  REL_POS_ZERO_INIT: false
+  RESIDUAL_POOLING: false
+  REV:
+    BUFFER_LAYERS: []
+    ENABLE: false
+    PRE_Q_FUSION: avg
+    RESPATH_FUSE: concat
+    RES_PATH: conv
+  SEPARATE_QKV: false
+  SEP_POS_EMBED: false
+  USE_ABS_POS: true
+  USE_FIXED_SINCOS_POS: false
+  USE_MEAN_POOLING: false
+  ZERO_DECAY_POS_CLS: true
+NONLOCAL:
+  GROUP:
+  - - 1
+  - - 1
+  - - 1
+  - - 1
+  INSTANTIATION: dot_product
+  LOCATION:
+  - - []
+  - - []
+  - - []
+  - - []
+  POOL:
+  - - - 1
+      - 2
+      - 2
+    - - 1
+      - 2
+      - 2
+  - - - 1
+      - 2
+      - 2
+    - - 1
+      - 2
+      - 2
+  - - - 1
+      - 2
+      - 2
+    - - 1
+      - 2
+      - 2
+  - - - 1
+      - 2
+      - 2
+    - - 1
+      - 2
+      - 2
+NUM_GPUS: 8
+NUM_SHARDS: 1
+OUTPUT_DIR: kabr/KABR/logs/x3d-l-kabr
+RESNET:
+  DEPTH: 50
+  INPLACE_RELU: true
+  NUM_BLOCK_TEMP_KERNEL:
+  - - 3
+  - - 4
+  - - 6
+  - - 3
+  NUM_GROUPS: 1
+  SPATIAL_DILATIONS:
+  - - 1
+  - - 1
+  - - 1
+  - - 1
+  SPATIAL_STRIDES:
+  - - 1
+  - - 2
+  - - 2
+  - - 2
+  STRIDE_1X1: false
+  TRANS_FUNC: x3d_transform
+  WIDTH_PER_GROUP: 64
+  ZERO_INIT_FINAL_BN: true
+  ZERO_INIT_FINAL_CONV: false
+RNG_SEED: 0
+SHARD_ID: 0
+SLOWFAST:
+  ALPHA: 8
+  BETA_INV: 8
+  FUSION_CONV_CHANNEL_RATIO: 2
+  FUSION_KERNEL_SZ: 5
+SOLVER:
+  BASE_LR: 0.05
+  BASE_LR_SCALE_NUM_SHARDS: true
+  BETAS:
+  - 0.9
+  - 0.999
+  CLIP_GRAD_L2NORM: null
+  CLIP_GRAD_VAL: null
+  COSINE_AFTER_WARMUP: false
+  COSINE_END_LR: 0.0
+  DAMPENING: 0.0
+  GAMMA: 0.1
+  LARS_ON: false
+  LAYER_DECAY: 1.0
+  LRS: []
+  LR_POLICY: cosine
+  MAX_EPOCH: 120
+  MOMENTUM: 0.9
+  NESTEROV: true
+  OPTIMIZING_METHOD: sgd
+  STEPS: []
+  STEP_SIZE: 1
+  WARMUP_EPOCHS: 35.0
+  WARMUP_FACTOR: 0.1
+  WARMUP_START_LR: 0.01
+  WEIGHT_DECAY: 5.0e-05
+  ZERO_WD_1D_PARAM: false
+TASK: ''
+TENSORBOARD:
+  CATEGORIES_PATH: ''
+  CLASS_NAMES_PATH: kabr/KABR/annotation/classes.json
+  CONFUSION_MATRIX:
+    ENABLE: true
+    FIGSIZE:
+    - 8
+    - 8
+    SUBSET_PATH: kabr/KABR/annotation/classes.txt
+  ENABLE: true
+  HISTOGRAM:
+    ENABLE: true
+    FIGSIZE:
+    - 8
+    - 8
+    SUBSET_PATH: kabr/KABR/annotation/classes.txt
+    TOPK: 3
+  LOG_DIR: ''
+  MODEL_VIS:
+    ACTIVATIONS: true
+    COLORMAP: Pastel2
+    ENABLE: true
+    GRAD_CAM:
+      COLORMAP: viridis
+      ENABLE: true
+      LAYER_LIST:
+      - s5/pathway0_res14
+      USE_TRUE_LABEL: false
+    INPUT_VIDEO: true
+    LAYER_LIST:
+    - s5/pathway0_res14
+    MODEL_WEIGHTS: true
+    TOPK_PREDS: 1
+  PREDICTIONS_PATH: ''
+  WRONG_PRED_VIS:
+    ENABLE: false
+    SUBSET_PATH: ''
+    TAG: Incorrectly classified videos.
+TEST:
+  BATCH_SIZE: 64
+  CHECKPOINT_FILE_PATH: ''
+  CHECKPOINT_TYPE: pytorch
+  DATASET: charades
+  ENABLE: false
+  NUM_ENSEMBLE_VIEWS: 2
+  NUM_SPATIAL_CROPS: 1
+  NUM_TEMPORAL_CLIPS: []
+  SAVE_RESULTS_PATH: kabr/KABR/logs/x3d-l-kabr/results.txt
+TRAIN:
+  AUTO_RESUME: true
+  BATCH_SIZE: 64
+  CHECKPOINT_CLEAR_NAME_PATTERN: []
+  CHECKPOINT_EPOCH_RESET: true
+  CHECKPOINT_FILE_PATH: slowfast/projects/x3d/x3d_l.pyth
+  CHECKPOINT_INFLATE: false
+  CHECKPOINT_IN_INIT: false
+  CHECKPOINT_PERIOD: 5
+  CHECKPOINT_TYPE: pytorch
+  DATASET: charades
+  ENABLE: true
+  EVAL_PERIOD: 5
+  KILL_LOSS_EXPLOSION_FACTOR: 0.0
+  MIXED_PRECISION: false
+VIS_MASK:
+  ENABLE: false
+X3D:
+  BN_LIN5: false
+  BOTTLENECK_FACTOR: 2.25
+  CHANNELWISE_3x3x3: true
+  DEPTH_FACTOR: 5.0
+  DIM_C1: 12
+  DIM_C5: 2048
+  SCALE_RES2: false
+  WIDTH_FACTOR: 2.0
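The YAML above overrides the defaults defined in helpers/cfg.py further down. A sketch of checking two of those overrides (the relative path to config.yml is an assumption):

from x3d_model.cfg import load_config

cfg = load_config("config.yml")
assert cfg.MODEL.NUM_CLASSES == 8   # default is 400
assert cfg.DATA.NUM_FRAMES == 16    # default is 8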
configuration_x3d.py ADDED
@@ -0,0 +1,9 @@
+from transformers import PretrainedConfig
+from x3d_model.cfg import load_config
+
+class X3DConfig(PretrainedConfig):
+    model_type = "x3d"
+
+    def __init__(self, path: str = None, **kwargs):
+        super().__init__(**kwargs)
+        self.cfg = load_config(path)
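A sketch of wiring X3DConfig into the Transformers auto classes; AutoConfig.register is the standard hook, and the local import path is an assumption:

from transformers import AutoConfig
from x3d_model.configuration_x3d import X3DConfig  # assumed import path

AutoConfig.register("x3d", X3DConfig)   # map model_type "x3d" to this class
config = X3DConfig(path="config.yml")   # wraps load_config internally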
helpers/cfg.py ADDED
@@ -0,0 +1,1286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3
+
4
+ """Configs."""
5
+ import math
6
+
7
+ from fvcore.common.config import CfgNode
8
+
9
+ # -----------------------------------------------------------------------------
10
+ # Config definition
11
+ # -----------------------------------------------------------------------------
12
+ _C = CfgNode()
13
+
14
+ # -----------------------------------------------------------------------------
15
+ # Contrastive Model (for MoCo, SimCLR, SwAV, BYOL)
16
+ # -----------------------------------------------------------------------------
17
+
18
+ _C.CONTRASTIVE = CfgNode()
19
+
20
+ # temperature used for contrastive losses
21
+ _C.CONTRASTIVE.T = 0.07
22
+
23
+ # output dimension for the loss
24
+ _C.CONTRASTIVE.DIM = 128
25
+
26
+ # number of training samples (for kNN bank)
27
+ _C.CONTRASTIVE.LENGTH = 239975
28
+
29
+ # the length of MoCo's and MemBanks' queues
30
+ _C.CONTRASTIVE.QUEUE_LEN = 65536
31
+
32
+ # momentum for momentum encoder updates
33
+ _C.CONTRASTIVE.MOMENTUM = 0.5
34
+
35
+ # wether to anneal momentum to value above with cosine schedule
36
+ _C.CONTRASTIVE.MOMENTUM_ANNEALING = False
37
+
38
+ # either memorybank, moco, simclr, byol, swav
39
+ _C.CONTRASTIVE.TYPE = "mem"
40
+
41
+ # wether to interpolate memorybank in time
42
+ _C.CONTRASTIVE.INTERP_MEMORY = False
43
+
44
+ # 1d or 2d (+temporal) memory
45
+ _C.CONTRASTIVE.MEM_TYPE = "1d"
46
+
47
+ # number of classes for online kNN evaluation
48
+ _C.CONTRASTIVE.NUM_CLASSES_DOWNSTREAM = 400
49
+
50
+ # use an MLP projection with these num layers
51
+ _C.CONTRASTIVE.NUM_MLP_LAYERS = 1
52
+
53
+ # dimension of projection and predictor MLPs
54
+ _C.CONTRASTIVE.MLP_DIM = 2048
55
+
56
+ # use BN in projection/prediction MLP
57
+ _C.CONTRASTIVE.BN_MLP = False
58
+
59
+ # use synchronized BN in projection/prediction MLP
60
+ _C.CONTRASTIVE.BN_SYNC_MLP = False
61
+
62
+ # shuffle BN only locally vs. across machines
63
+ _C.CONTRASTIVE.LOCAL_SHUFFLE_BN = True
64
+
65
+ # Wether to fill multiple clips (or just the first) into queue
66
+ _C.CONTRASTIVE.MOCO_MULTI_VIEW_QUEUE = False
67
+
68
+ # if sampling multiple clips per vid they need to be at least min frames apart
69
+ _C.CONTRASTIVE.DELTA_CLIPS_MIN = -math.inf
70
+
71
+ # if sampling multiple clips per vid they can be max frames apart
72
+ _C.CONTRASTIVE.DELTA_CLIPS_MAX = math.inf
73
+
74
+ # if non empty, use predictors with depth specified
75
+ _C.CONTRASTIVE.PREDICTOR_DEPTHS = []
76
+
77
+ # Wether to sequentially process multiple clips (=lower mem usage) or batch them
78
+ _C.CONTRASTIVE.SEQUENTIAL = False
79
+
80
+ # Wether to perform SimCLR loss across machines (or only locally)
81
+ _C.CONTRASTIVE.SIMCLR_DIST_ON = True
82
+
83
+ # Length of queue used in SwAV
84
+ _C.CONTRASTIVE.SWAV_QEUE_LEN = 0
85
+
86
+ # Wether to run online kNN evaluation during training
87
+ _C.CONTRASTIVE.KNN_ON = True
88
+
89
+
90
+ # ---------------------------------------------------------------------------- #
91
+ # Batch norm options
92
+ # ---------------------------------------------------------------------------- #
93
+ _C.BN = CfgNode()
94
+
95
+ # Precise BN stats.
96
+ _C.BN.USE_PRECISE_STATS = False
97
+
98
+ # Number of samples use to compute precise bn.
99
+ _C.BN.NUM_BATCHES_PRECISE = 200
100
+
101
+ # Weight decay value that applies on BN.
102
+ _C.BN.WEIGHT_DECAY = 0.0
103
+
104
+ # Norm type, options include `batchnorm`, `sub_batchnorm`, `sync_batchnorm`
105
+ _C.BN.NORM_TYPE = "batchnorm"
106
+
107
+ # Parameter for SubBatchNorm, where it splits the batch dimension into
108
+ # NUM_SPLITS splits, and run BN on each of them separately independently.
109
+ _C.BN.NUM_SPLITS = 1
110
+
111
+ # Parameter for NaiveSyncBatchNorm, where the stats across `NUM_SYNC_DEVICES`
112
+ # devices will be synchronized. `NUM_SYNC_DEVICES` cannot be larger than number of
113
+ # devices per machine; if global sync is desired, set `GLOBAL_SYNC`.
114
+ # By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting
115
+ # CONTRASTIVE.BN_SYNC_MLP if appropriate.
116
+ _C.BN.NUM_SYNC_DEVICES = 1
117
+
118
+ # Parameter for NaiveSyncBatchNorm. Setting `GLOBAL_SYNC` to True synchronizes
119
+ # stats across all devices, across all machines; in this case, `NUM_SYNC_DEVICES`
120
+ # must be set to None.
121
+ # By default ONLY applies to NaiveSyncBatchNorm3d; consider also setting
122
+ # CONTRASTIVE.BN_SYNC_MLP if appropriate.
123
+ _C.BN.GLOBAL_SYNC = False
124
+
125
+ # ---------------------------------------------------------------------------- #
126
+ # Training options.
127
+ # ---------------------------------------------------------------------------- #
128
+ _C.TRAIN = CfgNode()
129
+
130
+ # If True Train the model, else skip training.
131
+ _C.TRAIN.ENABLE = True
132
+
133
+ # Kill training if loss explodes over this ratio from the previous 5 measurements.
134
+ # Only enforced if > 0.0
135
+ _C.TRAIN.KILL_LOSS_EXPLOSION_FACTOR = 0.0
136
+
137
+ # Dataset.
138
+ _C.TRAIN.DATASET = "kinetics"
139
+
140
+ # Total mini-batch size.
141
+ _C.TRAIN.BATCH_SIZE = 64
142
+
143
+ # Evaluate model on test data every eval period epochs.
144
+ _C.TRAIN.EVAL_PERIOD = 10
145
+
146
+ # Save model checkpoint every checkpoint period epochs.
147
+ _C.TRAIN.CHECKPOINT_PERIOD = 10
148
+
149
+ # Resume training from the latest checkpoint in the output directory.
150
+ _C.TRAIN.AUTO_RESUME = True
151
+
152
+ # Path to the checkpoint to load the initial weight.
153
+ _C.TRAIN.CHECKPOINT_FILE_PATH = ""
154
+
155
+ # Checkpoint types include `caffe2` or `pytorch`.
156
+ _C.TRAIN.CHECKPOINT_TYPE = "pytorch"
157
+
158
+ # If True, perform inflation when loading checkpoint.
159
+ _C.TRAIN.CHECKPOINT_INFLATE = False
160
+
161
+ # If True, reset epochs when loading checkpoint.
162
+ _C.TRAIN.CHECKPOINT_EPOCH_RESET = False
163
+
164
+ # If set, clear all layer names according to the pattern provided.
165
+ _C.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN = () # ("backbone.",)
166
+
167
+ # If True, use FP16 for activations
168
+ _C.TRAIN.MIXED_PRECISION = False
169
+
170
+ # if True, inflate some params from imagenet model.
171
+ _C.TRAIN.CHECKPOINT_IN_INIT = False
172
+
173
+ # ---------------------------------------------------------------------------- #
174
+ # Augmentation options.
175
+ # ---------------------------------------------------------------------------- #
176
+ _C.AUG = CfgNode()
177
+
178
+ # Whether to enable randaug.
179
+ _C.AUG.ENABLE = False
180
+
181
+ # Number of repeated augmentations to used during training.
182
+ # If this is greater than 1, then the actual batch size is
183
+ # TRAIN.BATCH_SIZE * AUG.NUM_SAMPLE.
184
+ _C.AUG.NUM_SAMPLE = 1
185
+
186
+ # Not used if using randaug.
187
+ _C.AUG.COLOR_JITTER = 0.4
188
+
189
+ # RandAug parameters.
190
+ _C.AUG.AA_TYPE = "rand-m9-mstd0.5-inc1"
191
+
192
+ # Interpolation method.
193
+ _C.AUG.INTERPOLATION = "bicubic"
194
+
195
+ # Probability of random erasing.
196
+ _C.AUG.RE_PROB = 0.25
197
+
198
+ # Random erasing mode.
199
+ _C.AUG.RE_MODE = "pixel"
200
+
201
+ # Random erase count.
202
+ _C.AUG.RE_COUNT = 1
203
+
204
+ # Do not random erase first (clean) augmentation split.
205
+ _C.AUG.RE_SPLIT = False
206
+
207
+ # Whether to generate input mask during image processing.
208
+ _C.AUG.GEN_MASK_LOADER = False
209
+
210
+ # If True, masking mode is "tube". Default is "cube".
211
+ _C.AUG.MASK_TUBE = False
212
+
213
+ # If True, masking mode is "frame". Default is "cube".
214
+ _C.AUG.MASK_FRAMES = False
215
+
216
+ # The size of generated masks.
217
+ _C.AUG.MASK_WINDOW_SIZE = [8, 7, 7]
218
+
219
+ # The ratio of masked tokens out of all tokens. Also applies to MViT supervised training
220
+ _C.AUG.MASK_RATIO = 0.0
221
+
222
+ # The maximum number of a masked block. None means no maximum limit. (Used only in image MaskFeat.)
223
+ _C.AUG.MAX_MASK_PATCHES_PER_BLOCK = None
224
+
225
+ # ---------------------------------------------------------------------------- #
226
+ # Masked pretraining visualization options.
227
+ # ---------------------------------------------------------------------------- #
228
+ _C.VIS_MASK = CfgNode()
229
+
230
+ # Whether to do visualization.
231
+ _C.VIS_MASK.ENABLE = False
232
+
233
+ # ---------------------------------------------------------------------------- #
234
+ # MipUp options.
235
+ # ---------------------------------------------------------------------------- #
236
+ _C.MIXUP = CfgNode()
237
+
238
+ # Whether to use mixup.
239
+ _C.MIXUP.ENABLE = False
240
+
241
+ # Mixup alpha.
242
+ _C.MIXUP.ALPHA = 0.8
243
+
244
+ # Cutmix alpha.
245
+ _C.MIXUP.CUTMIX_ALPHA = 1.0
246
+
247
+ # Probability of performing mixup or cutmix when either/both is enabled.
248
+ _C.MIXUP.PROB = 1.0
249
+
250
+ # Probability of switching to cutmix when both mixup and cutmix enabled.
251
+ _C.MIXUP.SWITCH_PROB = 0.5
252
+
253
+ # Label smoothing.
254
+ _C.MIXUP.LABEL_SMOOTH_VALUE = 0.1
255
+
256
+ # ---------------------------------------------------------------------------- #
257
+ # Testing options
258
+ # ---------------------------------------------------------------------------- #
259
+ _C.TEST = CfgNode()
260
+
261
+ # If True test the model, else skip the testing.
262
+ _C.TEST.ENABLE = True
263
+
264
+ # Dataset for testing.
265
+ _C.TEST.DATASET = "kinetics"
266
+
267
+ # Total mini-batch size
268
+ _C.TEST.BATCH_SIZE = 8
269
+
270
+ # Path to the checkpoint to load the initial weight.
271
+ _C.TEST.CHECKPOINT_FILE_PATH = ""
272
+
273
+ # Number of clips to sample from a video uniformly for aggregating the
274
+ # prediction results.
275
+ _C.TEST.NUM_ENSEMBLE_VIEWS = 10
276
+
277
+ # Number of crops to sample from a frame spatially for aggregating the
278
+ # prediction results.
279
+ _C.TEST.NUM_SPATIAL_CROPS = 3
280
+
281
+ # Checkpoint types include `caffe2` or `pytorch`.
282
+ _C.TEST.CHECKPOINT_TYPE = "pytorch"
283
+ # Path to saving prediction results file.
284
+ _C.TEST.SAVE_RESULTS_PATH = ""
285
+
286
+ _C.TEST.NUM_TEMPORAL_CLIPS = []
287
+ # -----------------------------------------------------------------------------
288
+ # ResNet options
289
+ # -----------------------------------------------------------------------------
290
+ _C.RESNET = CfgNode()
291
+
292
+ # Transformation function.
293
+ _C.RESNET.TRANS_FUNC = "bottleneck_transform"
294
+
295
+ # Number of groups. 1 for ResNet, and larger than 1 for ResNeXt).
296
+ _C.RESNET.NUM_GROUPS = 1
297
+
298
+ # Width of each group (64 -> ResNet; 4 -> ResNeXt).
299
+ _C.RESNET.WIDTH_PER_GROUP = 64
300
+
301
+ # Apply relu in a inplace manner.
302
+ _C.RESNET.INPLACE_RELU = True
303
+
304
+ # Apply stride to 1x1 conv.
305
+ _C.RESNET.STRIDE_1X1 = False
306
+
307
+ # If true, initialize the gamma of the final BN of each block to zero.
308
+ _C.RESNET.ZERO_INIT_FINAL_BN = False
309
+
310
+ # If true, initialize the final conv layer of each block to zero.
311
+ _C.RESNET.ZERO_INIT_FINAL_CONV = False
312
+
313
+ # Number of weight layers.
314
+ _C.RESNET.DEPTH = 50
315
+
316
+ # If the current block has more than NUM_BLOCK_TEMP_KERNEL blocks, use temporal
317
+ # kernel of 1 for the rest of the blocks.
318
+ _C.RESNET.NUM_BLOCK_TEMP_KERNEL = [[3], [4], [6], [3]]
319
+
320
+ # Size of stride on different res stages.
321
+ _C.RESNET.SPATIAL_STRIDES = [[1], [2], [2], [2]]
322
+
323
+ # Size of dilation on different res stages.
324
+ _C.RESNET.SPATIAL_DILATIONS = [[1], [1], [1], [1]]
325
+
326
+ # ---------------------------------------------------------------------------- #
327
+ # X3D options
328
+ # See https://arxiv.org/abs/2004.04730 for details about X3D Networks.
329
+ # ---------------------------------------------------------------------------- #
330
+ _C.X3D = CfgNode()
331
+
332
+ # Width expansion factor.
333
+ _C.X3D.WIDTH_FACTOR = 1.0
334
+
335
+ # Depth expansion factor.
336
+ _C.X3D.DEPTH_FACTOR = 1.0
337
+
338
+ # Bottleneck expansion factor for the 3x3x3 conv.
339
+ _C.X3D.BOTTLENECK_FACTOR = 1.0 #
340
+
341
+ # Dimensions of the last linear layer before classificaiton.
342
+ _C.X3D.DIM_C5 = 2048
343
+
344
+ # Dimensions of the first 3x3 conv layer.
345
+ _C.X3D.DIM_C1 = 12
346
+
347
+ # Whether to scale the width of Res2, default is false.
348
+ _C.X3D.SCALE_RES2 = False
349
+
350
+ # Whether to use a BatchNorm (BN) layer before the classifier, default is false.
351
+ _C.X3D.BN_LIN5 = False
352
+
353
+ # Whether to use channelwise (=depthwise) convolution in the center (3x3x3)
354
+ # convolution operation of the residual blocks.
355
+ _C.X3D.CHANNELWISE_3x3x3 = True
356
+
357
+ # -----------------------------------------------------------------------------
358
+ # Nonlocal options
359
+ # -----------------------------------------------------------------------------
360
+ _C.NONLOCAL = CfgNode()
361
+
362
+ # Index of each stage and block to add nonlocal layers.
363
+ _C.NONLOCAL.LOCATION = [[[]], [[]], [[]], [[]]]
364
+
365
+ # Number of group for nonlocal for each stage.
366
+ _C.NONLOCAL.GROUP = [[1], [1], [1], [1]]
367
+
368
+ # Instatiation to use for non-local layer.
369
+ _C.NONLOCAL.INSTANTIATION = "dot_product"
370
+
371
+
372
+ # Size of pooling layers used in Non-Local.
373
+ _C.NONLOCAL.POOL = [
374
+ # Res2
375
+ [[1, 2, 2], [1, 2, 2]],
376
+ # Res3
377
+ [[1, 2, 2], [1, 2, 2]],
378
+ # Res4
379
+ [[1, 2, 2], [1, 2, 2]],
380
+ # Res5
381
+ [[1, 2, 2], [1, 2, 2]],
382
+ ]
383
+
384
+ # -----------------------------------------------------------------------------
385
+ # Model options
386
+ # -----------------------------------------------------------------------------
387
+ _C.MODEL = CfgNode()
388
+
389
+ # Model architecture.
390
+ _C.MODEL.ARCH = "slowfast"
391
+
392
+ # Model name
393
+ _C.MODEL.MODEL_NAME = "SlowFast"
394
+
395
+ # The number of classes to predict for the model.
396
+ _C.MODEL.NUM_CLASSES = 400
397
+
398
+ # Loss function.
399
+ _C.MODEL.LOSS_FUNC = "cross_entropy"
400
+
401
+ # Model architectures that has one single pathway.
402
+ _C.MODEL.SINGLE_PATHWAY_ARCH = [
403
+ "2d",
404
+ "c2d",
405
+ "i3d",
406
+ "slow",
407
+ "x3d",
408
+ "mvit",
409
+ "maskmvit",
410
+ ]
411
+
412
+ # Model architectures that has multiple pathways.
413
+ _C.MODEL.MULTI_PATHWAY_ARCH = ["slowfast"]
414
+
415
+ # Dropout rate before final projection in the backbone.
416
+ _C.MODEL.DROPOUT_RATE = 0.5
417
+
418
+ # Randomly drop rate for Res-blocks, linearly increase from res2 to res5
419
+ _C.MODEL.DROPCONNECT_RATE = 0.0
420
+
421
+ # The std to initialize the fc layer(s).
422
+ _C.MODEL.FC_INIT_STD = 0.01
423
+
424
+ # Activation layer for the output head.
425
+ _C.MODEL.HEAD_ACT = "softmax"
426
+
427
+ # Activation checkpointing enabled or not to save GPU memory.
428
+ _C.MODEL.ACT_CHECKPOINT = False
429
+
430
+ # If True, detach the final fc layer from the network, by doing so, only the
431
+ # final fc layer will be trained.
432
+ _C.MODEL.DETACH_FINAL_FC = False
433
+
434
+ # If True, frozen batch norm stats during training.
435
+ _C.MODEL.FROZEN_BN = False
436
+
437
+ # If True, AllReduce gradients are compressed to fp16
438
+ _C.MODEL.FP16_ALLREDUCE = False
439
+
440
+
441
+ # -----------------------------------------------------------------------------
442
+ # MViT options
443
+ # -----------------------------------------------------------------------------
444
+ _C.MVIT = CfgNode()
445
+
446
+ # Options include `conv`, `max`.
447
+ _C.MVIT.MODE = "conv"
448
+
449
+ # If True, perform pool before projection in attention.
450
+ _C.MVIT.POOL_FIRST = False
451
+
452
+ # If True, use cls embed in the network, otherwise don't use cls_embed in transformer.
453
+ _C.MVIT.CLS_EMBED_ON = True
454
+
455
+ # Kernel size for patchtification.
456
+ _C.MVIT.PATCH_KERNEL = [3, 7, 7]
457
+
458
+ # Stride size for patchtification.
459
+ _C.MVIT.PATCH_STRIDE = [2, 4, 4]
460
+
461
+ # Padding size for patchtification.
462
+ _C.MVIT.PATCH_PADDING = [2, 4, 4]
463
+
464
+ # If True, use 2d patch, otherwise use 3d patch.
465
+ _C.MVIT.PATCH_2D = False
466
+
467
+ # Base embedding dimension for the transformer.
468
+ _C.MVIT.EMBED_DIM = 96
469
+
470
+ # Base num of heads for the transformer.
471
+ _C.MVIT.NUM_HEADS = 1
472
+
473
+ # Dimension reduction ratio for the MLP layers.
474
+ _C.MVIT.MLP_RATIO = 4.0
475
+
476
+ # If use, use bias term in attention fc layers.
477
+ _C.MVIT.QKV_BIAS = True
478
+
479
+ # Drop path rate for the tranfomer.
480
+ _C.MVIT.DROPPATH_RATE = 0.1
481
+
482
+ # The initial value of layer scale gamma. Set 0.0 to disable layer scale.
483
+ _C.MVIT.LAYER_SCALE_INIT_VALUE = 0.0
484
+
485
+ # Depth of the transformer.
486
+ _C.MVIT.DEPTH = 16
487
+
488
+ # Normalization layer for the transformer. Only layernorm is supported now.
489
+ _C.MVIT.NORM = "layernorm"
490
+
491
+ # Dimension multiplication at layer i. If 2.0 is used, then the next block will increase
492
+ # the dimension by 2 times. Format: [depth_i: mul_dim_ratio]
493
+ _C.MVIT.DIM_MUL = []
494
+
495
+ # Head number multiplication at layer i. If 2.0 is used, then the next block will
496
+ # increase the number of heads by 2 times. Format: [depth_i: head_mul_ratio]
497
+ _C.MVIT.HEAD_MUL = []
498
+
499
+ # Stride size for the Pool KV at layer i.
500
+ # Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...,]
501
+ _C.MVIT.POOL_KV_STRIDE = []
502
+
503
+ # Initial stride size for KV at layer 1. The stride size will be further reduced with
504
+ # the raio of MVIT.DIM_MUL. If will overwrite MVIT.POOL_KV_STRIDE if not None.
505
+ _C.MVIT.POOL_KV_STRIDE_ADAPTIVE = None
506
+
507
+ # Stride size for the Pool Q at layer i.
508
+ # Format: [[i, stride_t_i, stride_h_i, stride_w_i], ...,]
509
+ _C.MVIT.POOL_Q_STRIDE = []
510
+
511
+ # If not None, overwrite the KV_KERNEL and Q_KERNEL size with POOL_KVQ_CONV_SIZ.
512
+ # Otherwise the kernel_size is [s + 1 if s > 1 else s for s in stride_size].
513
+ _C.MVIT.POOL_KVQ_KERNEL = None
514
+
515
+ # If True, perform no decay on positional embedding and cls embedding.
516
+ _C.MVIT.ZERO_DECAY_POS_CLS = True
517
+
518
+ # If True, use norm after stem.
519
+ _C.MVIT.NORM_STEM = False
520
+
521
+ # If True, perform separate positional embedding.
522
+ _C.MVIT.SEP_POS_EMBED = False
523
+
524
+ # Dropout rate for the MViT backbone.
525
+ _C.MVIT.DROPOUT_RATE = 0.0
526
+
527
+ # If True, use absolute positional embedding.
528
+ _C.MVIT.USE_ABS_POS = True
529
+
530
+ # If True, use relative positional embedding for spatial dimentions
531
+ _C.MVIT.REL_POS_SPATIAL = False
532
+
533
+ # If True, use relative positional embedding for temporal dimentions
534
+ _C.MVIT.REL_POS_TEMPORAL = False
535
+
536
+ # If True, init rel with zero
537
+ _C.MVIT.REL_POS_ZERO_INIT = False
538
+
539
+ # If True, using Residual Pooling connection
540
+ _C.MVIT.RESIDUAL_POOLING = False
541
+
542
+ # Dim mul in qkv linear layers of attention block instead of MLP
543
+ _C.MVIT.DIM_MUL_IN_ATT = False
544
+
545
+ # If True, using separate linear layers for Q, K, V in attention blocks.
546
+ _C.MVIT.SEPARATE_QKV = False
547
+
548
+ # The initialization scale factor for the head parameters.
549
+ _C.MVIT.HEAD_INIT_SCALE = 1.0
550
+
551
+ # Whether to use the mean pooling of all patch tokens as the output.
552
+ _C.MVIT.USE_MEAN_POOLING = False
553
+
554
+ # If True, use frozen sin cos positional embedding.
555
+ _C.MVIT.USE_FIXED_SINCOS_POS = False
556
+
557
+ # -----------------------------------------------------------------------------
558
+ # Masked pretraining options
559
+ # -----------------------------------------------------------------------------
560
+ _C.MASK = CfgNode()
561
+
562
+ # Whether to enable Masked style pretraining.
563
+ _C.MASK.ENABLE = False
564
+
565
+ # Whether to enable MAE (discard encoder tokens).
566
+ _C.MASK.MAE_ON = False
567
+
568
+ # Whether to enable random masking in mae
569
+ _C.MASK.MAE_RND_MASK = False
570
+
571
+ # Whether to do random masking per-frame in mae
572
+ _C.MASK.PER_FRAME_MASKING = False
573
+
574
+ # only predict loss on temporal strided patches, or predict full time extent
575
+ _C.MASK.TIME_STRIDE_LOSS = True
576
+
577
+ # Whether to normalize the pred pixel loss
578
+ _C.MASK.NORM_PRED_PIXEL = True
579
+
580
+ # Whether to fix initialization with inverse depth of layer for pretraining.
581
+ _C.MASK.SCALE_INIT_BY_DEPTH = False
582
+
583
+ # Base embedding dimension for the decoder transformer.
584
+ _C.MASK.DECODER_EMBED_DIM = 512
585
+
586
+ # Base embedding dimension for the decoder transformer.
587
+ _C.MASK.DECODER_SEP_POS_EMBED = False
588
+
589
+ # Use a KV kernel in decoder?
590
+ _C.MASK.DEC_KV_KERNEL = []
591
+
592
+ # Use a KV stride in decoder?
593
+ _C.MASK.DEC_KV_STRIDE = []
594
+
595
+ # The depths of features which are inputs of the prediction head.
596
+ _C.MASK.PRETRAIN_DEPTH = [15]
597
+
598
+ # The type of Masked pretraining prediction head.
599
+ # Can be "separate", "separate_xformer".
600
+ _C.MASK.HEAD_TYPE = "separate"
601
+
602
+ # The depth of MAE's decoder
603
+ _C.MASK.DECODER_DEPTH = 0
604
+
605
+ # The weight of HOG target loss.
606
+ _C.MASK.PRED_HOG = False
607
+ # Reversible Configs
608
+ _C.MVIT.REV = CfgNode()
609
+
610
+ # Enable Reversible Model
611
+ _C.MVIT.REV.ENABLE = False
612
+
613
+ # Method to fuse the reversible paths
614
+ # see :class: `TwoStreamFusion` for all the options
615
+ _C.MVIT.REV.RESPATH_FUSE = "concat"
616
+
617
+ # Layers to buffer activations at
618
+ # (at least Q-pooling layers needed)
619
+ _C.MVIT.REV.BUFFER_LAYERS = []
620
+
621
+ # 'conv' or 'max' operator for the respath in Qpooling
622
+ _C.MVIT.REV.RES_PATH = "conv"
623
+
624
+ # Method to merge hidden states before Qpoolinglayers
625
+ _C.MVIT.REV.PRE_Q_FUSION = "avg"
626
+
627
+ # -----------------------------------------------------------------------------
628
+ # SlowFast options
629
+ # -----------------------------------------------------------------------------
630
+ _C.SLOWFAST = CfgNode()
631
+
632
+ # Corresponds to the inverse of the channel reduction ratio, $\beta$ between
633
+ # the Slow and Fast pathways.
634
+ _C.SLOWFAST.BETA_INV = 8
635
+
636
+ # Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and
637
+ # Fast pathways.
638
+ _C.SLOWFAST.ALPHA = 8
639
+
640
+ # Ratio of channel dimensions between the Slow and Fast pathways.
641
+ _C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2
642
+
643
+ # Kernel dimension used for fusing information from Fast pathway to Slow
644
+ # pathway.
645
+ _C.SLOWFAST.FUSION_KERNEL_SZ = 5
646
+
647
+
648
+ # -----------------------------------------------------------------------------
649
+ # Data options
650
+ # -----------------------------------------------------------------------------
651
+ _C.DATA = CfgNode()
652
+
653
+ # The path to the data directory.
654
+ _C.DATA.PATH_TO_DATA_DIR = ""
655
+
656
+ # The separator used between path and label.
657
+ _C.DATA.PATH_LABEL_SEPARATOR = " "
658
+
659
+ # Video path prefix if any.
660
+ _C.DATA.PATH_PREFIX = ""
661
+
662
+ # The number of frames of the input clip.
663
+ _C.DATA.NUM_FRAMES = 8
664
+
665
+ # The video sampling rate of the input clip.
666
+ _C.DATA.SAMPLING_RATE = 8
667
+
668
+ # Eigenvalues for PCA jittering. Note PCA is RGB based.
669
+ _C.DATA.TRAIN_PCA_EIGVAL = [0.225, 0.224, 0.229]
670
+
671
+ # Eigenvectors for PCA jittering.
672
+ _C.DATA.TRAIN_PCA_EIGVEC = [
673
+ [-0.5675, 0.7192, 0.4009],
674
+ [-0.5808, -0.0045, -0.8140],
675
+ [-0.5836, -0.6948, 0.4203],
676
+ ]
677
+
678
+ # If a imdb have been dumpped to a local file with the following format:
679
+ # `{"im_path": im_path, "class": cont_id}`
680
+ # then we can skip the construction of imdb and load it from the local file.
681
+ _C.DATA.PATH_TO_PRELOAD_IMDB = ""
682
+
683
+ # The mean value of the video raw pixels across the R G B channels.
684
+ _C.DATA.MEAN = [0.45, 0.45, 0.45]
685
+ # List of input frame channel dimensions.
686
+
687
+ _C.DATA.INPUT_CHANNEL_NUM = [3, 3]
688
+
689
+ # The std value of the video raw pixels across the R G B channels.
690
+ _C.DATA.STD = [0.225, 0.225, 0.225]
691
+
692
+ # The spatial augmentation jitter scales for training.
693
+ _C.DATA.TRAIN_JITTER_SCALES = [256, 320]
694
+
695
+ # The relative scale range of Inception-style area based random resizing augmentation.
696
+ # If this is provided, DATA.TRAIN_JITTER_SCALES above is ignored.
697
+ _C.DATA.TRAIN_JITTER_SCALES_RELATIVE = []
698
+
699
+ # The relative aspect ratio range of Inception-style area based random resizing
700
+ # augmentation.
701
+ _C.DATA.TRAIN_JITTER_ASPECT_RELATIVE = []
702
+
703
+ # If True, perform stride length uniform temporal sampling.
704
+ _C.DATA.USE_OFFSET_SAMPLING = False
705
+
706
+ # Whether to apply motion shift for augmentation.
707
+ _C.DATA.TRAIN_JITTER_MOTION_SHIFT = False
708
+
709
+ # The spatial crop size for training.
710
+ _C.DATA.TRAIN_CROP_SIZE = 224
711
+
712
+ # The spatial crop size for testing.
713
+ _C.DATA.TEST_CROP_SIZE = 256
714
+
715
+ # Input videos may has different fps, convert it to the target video fps before
716
+ # frame sampling.
717
+ _C.DATA.TARGET_FPS = 30
718
+
719
+ # JITTER TARGET_FPS by +- this number randomly
720
+ _C.DATA.TRAIN_JITTER_FPS = 0.0
721
+
722
+ # Decoding backend, options include `pyav` or `torchvision`
723
+ _C.DATA.DECODING_BACKEND = "torchvision"
724
+
725
+ # Decoding resize to short size (set to native size for best speed)
726
+ _C.DATA.DECODING_SHORT_SIZE = 256
727
+
728
+ # if True, sample uniformly in [1 / max_scale, 1 / min_scale] and take a
729
+ # reciprocal to get the scale. If False, take a uniform sample from
730
+ # [min_scale, max_scale].
731
+ _C.DATA.INV_UNIFORM_SAMPLE = False
732
+
733
+ # If True, perform random horizontal flip on the video frames during training.
734
+ _C.DATA.RANDOM_FLIP = True
735
+
736
+ # If True, calculdate the map as metric.
737
+ _C.DATA.MULTI_LABEL = False
738
+
739
+ # Method to perform the ensemble, options include "sum" and "max".
740
+ _C.DATA.ENSEMBLE_METHOD = "sum"
741
+
742
+ # If True, revert the default input channel (RBG <-> BGR).
743
+ _C.DATA.REVERSE_INPUT_CHANNEL = False
744
+
745
+ # how many samples (=clips) to decode from a single video
746
+ _C.DATA.TRAIN_CROP_NUM_TEMPORAL = 1
747
+
748
+ # how many spatial samples to crop from a single clip
749
+ _C.DATA.TRAIN_CROP_NUM_SPATIAL = 1
750
+
751
+ # color random percentage for grayscale conversion
752
+ _C.DATA.COLOR_RND_GRAYSCALE = 0.0
753
+
754
+ # loader can read .csv file in chunks of this chunk size
755
+ _C.DATA.LOADER_CHUNK_SIZE = 0
756
+
757
+ # if LOADER_CHUNK_SIZE > 0, define overall length of .csv file
758
+ _C.DATA.LOADER_CHUNK_OVERALL_SIZE = 0
759
+
760
+ # for chunked reading, dataloader can skip rows in (large)
761
+ # training csv file
762
+ _C.DATA.SKIP_ROWS = 0
763
+
764
+ # The separator used between path and label.
765
+ _C.DATA.PATH_LABEL_SEPARATOR = " "
766
+
767
+ # augmentation probability to convert raw decoded video to
768
+ # grayscale temporal difference
769
+ _C.DATA.TIME_DIFF_PROB = 0.0
770
+
771
+ # Apply SSL-based SimCLR / MoCo v1/v2 color augmentations,
772
+ # with params below
773
+ _C.DATA.SSL_COLOR_JITTER = False
774
+
775
+ # color jitter percentage for brightness, contrast, saturation
776
+ _C.DATA.SSL_COLOR_BRI_CON_SAT = [0.4, 0.4, 0.4]
777
+
778
+ # color jitter percentage for hue
779
+ _C.DATA.SSL_COLOR_HUE = 0.1
780
+
781
+ # SimCLR / MoCo v2 augmentations on/off
782
+ _C.DATA.SSL_MOCOV2_AUG = False
783
+
784
+ # SimCLR / MoCo v2 blur augmentation minimum gaussian sigma
785
+ _C.DATA.SSL_BLUR_SIGMA_MIN = [0.0, 0.1]
786
+
787
+ # SimCLR / MoCo v2 blur augmentation maximum gaussian sigma
788
+ _C.DATA.SSL_BLUR_SIGMA_MAX = [0.0, 2.0]
789
+
790
+
791
+ # If combine train/val split as training for in21k
792
+ _C.DATA.IN22K_TRAINVAL = False
793
+
794
+ # If not None, use IN1k as val split when training in21k
795
+ _C.DATA.IN22k_VAL_IN1K = ""
796
+
797
+ # Large resolution models may use different crop ratios
798
+ _C.DATA.IN_VAL_CROP_RATIO = 0.875 # 224/256 = 0.875
799
+
800
+ # don't use real video for kinetics.py
801
+ _C.DATA.DUMMY_LOAD = False
802
+
803
+ # ---------------------------------------------------------------------------- #
804
+ # Optimizer options
805
+ # ---------------------------------------------------------------------------- #
806
+ _C.SOLVER = CfgNode()
807
+
808
+ # Base learning rate.
809
+ _C.SOLVER.BASE_LR = 0.1
810
+
811
+ # Learning rate policy (see utils/lr_policy.py for options and examples).
812
+ _C.SOLVER.LR_POLICY = "cosine"
813
+
814
+ # Final learning rates for 'cosine' policy.
815
+ _C.SOLVER.COSINE_END_LR = 0.0
816
+
817
+ # Exponential decay factor.
818
+ _C.SOLVER.GAMMA = 0.1
819
+
820
+ # Step size for 'exp' and 'cos' policies (in epochs).
821
+ _C.SOLVER.STEP_SIZE = 1
822
+
823
+ # Steps for 'steps_' policies (in epochs).
824
+ _C.SOLVER.STEPS = []
825
+
826
+ # Learning rates for 'steps_' policies.
827
+ _C.SOLVER.LRS = []
828
+
829
+ # Maximal number of epochs.
830
+ _C.SOLVER.MAX_EPOCH = 300
831
+
832
+ # Momentum.
833
+ _C.SOLVER.MOMENTUM = 0.9
834
+
835
+ # Momentum dampening.
836
+ _C.SOLVER.DAMPENING = 0.0
837
+
838
+ # Nesterov momentum.
839
+ _C.SOLVER.NESTEROV = True
840
+
841
+ # L2 regularization.
842
+ _C.SOLVER.WEIGHT_DECAY = 1e-4
843
+
844
+ # Start the warm up from SOLVER.BASE_LR * SOLVER.WARMUP_FACTOR.
845
+ _C.SOLVER.WARMUP_FACTOR = 0.1
846
+
847
+ # Gradually warm up the SOLVER.BASE_LR over this number of epochs.
848
+ _C.SOLVER.WARMUP_EPOCHS = 0.0
849
+
850
+ # The start learning rate of the warm up.
851
+ _C.SOLVER.WARMUP_START_LR = 0.01
852
+
853
+ # Optimization method.
854
+ _C.SOLVER.OPTIMIZING_METHOD = "sgd"
855
+
856
+ # Base learning rate is linearly scaled with NUM_SHARDS.
857
+ _C.SOLVER.BASE_LR_SCALE_NUM_SHARDS = False
858
+
859
+ # If True, start from the peak cosine learning rate after warm up.
860
+ _C.SOLVER.COSINE_AFTER_WARMUP = False
861
+
862
+ # If True, perform no weight decay on parameter with one dimension (bias term, etc).
863
+ _C.SOLVER.ZERO_WD_1D_PARAM = False
864
+
865
+ # Clip gradient at this value before optimizer update
866
+ _C.SOLVER.CLIP_GRAD_VAL = None
867
+
868
+ # Clip gradient at this norm before optimizer update
869
+ _C.SOLVER.CLIP_GRAD_L2NORM = None
870
+
871
+ # LARS optimizer
872
+ _C.SOLVER.LARS_ON = False
873
+
874
+ # The layer-wise decay of learning rate. Set to 1. to disable.
875
+ _C.SOLVER.LAYER_DECAY = 1.0
876
+
877
+ # Adam's beta
878
+ _C.SOLVER.BETAS = (0.9, 0.999)
879
+ # ---------------------------------------------------------------------------- #
880
+ # Misc options
881
+ # ---------------------------------------------------------------------------- #
882
+
883
+ # The name of the current task; e.g. "ssl"/"sl" for (self)supervised learning
884
+ _C.TASK = ""
885
+
886
+ # Number of GPUs to use (applies to both training and testing).
887
+ _C.NUM_GPUS = 1
888
+
889
+ # Number of machine to use for the job.
890
+ _C.NUM_SHARDS = 1
891
+
892
+ # The index of the current machine.
893
+ _C.SHARD_ID = 0
894
+
895
+ # Output basedir.
896
+ _C.OUTPUT_DIR = "."
897
+
898
+ # Note that non-determinism may still be present due to non-deterministic
899
+ # operator implementations in GPU operator libraries.
900
+ _C.RNG_SEED = 1
901
+
902
+ # Log period in iters.
903
+ _C.LOG_PERIOD = 10
904
+
905
+ # If True, log the model info.
906
+ _C.LOG_MODEL_INFO = True
907
+
908
+ # Distributed backend.
909
+ _C.DIST_BACKEND = "nccl"
910
+
911
+ # ---------------------------------------------------------------------------- #
912
+ # Benchmark options
913
+ # ---------------------------------------------------------------------------- #
914
+ _C.BENCHMARK = CfgNode()
915
+
916
+ # Number of epochs for data loading benchmark.
917
+ _C.BENCHMARK.NUM_EPOCHS = 5
918
+
919
+ # Log period in iters for data loading benchmark.
920
+ _C.BENCHMARK.LOG_PERIOD = 100
921
+
922
+ # If True, shuffle dataloader for epoch during benchmark.
923
+ _C.BENCHMARK.SHUFFLE = True
924
+
925
+
926
+ # ---------------------------------------------------------------------------- #
927
+ # Common train/test data loader options
928
+ # ---------------------------------------------------------------------------- #
929
+ _C.DATA_LOADER = CfgNode()
930
+
931
+ # Number of data loader workers per training process.
932
+ _C.DATA_LOADER.NUM_WORKERS = 8
933
+
934
+ # Load data to pinned host memory.
935
+ _C.DATA_LOADER.PIN_MEMORY = True
936
+
937
+ # Enable multi thread decoding.
938
+ _C.DATA_LOADER.ENABLE_MULTI_THREAD_DECODE = False
939
+
940
+
941
+ # ---------------------------------------------------------------------------- #
942
+ # Detection options.
943
+ # ---------------------------------------------------------------------------- #
944
+ _C.DETECTION = CfgNode()
945
+
946
+ # Whether enable video detection.
947
+ _C.DETECTION.ENABLE = False
948
+
949
+ # Aligned version of RoI. More details can be found at slowfast/models/head_helper.py
950
+ _C.DETECTION.ALIGNED = True
951
+
952
+ # Spatial scale factor.
953
+ _C.DETECTION.SPATIAL_SCALE_FACTOR = 16
954
+
955
+ # RoI tranformation resolution.
956
+ _C.DETECTION.ROI_XFORM_RESOLUTION = 7
957
+
958
+
959
+ # -----------------------------------------------------------------------------
960
+ # AVA Dataset options
961
+ # -----------------------------------------------------------------------------
962
+ _C.AVA = CfgNode()
963
+
964
+ # Directory path of frames.
965
+ _C.AVA.FRAME_DIR = "/mnt/fair-flash3-east/ava_trainval_frames.img/"
966
+
967
+ # Directory path for files of frame lists.
968
+ _C.AVA.FRAME_LIST_DIR = (
969
+ "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/"
970
+ )
971
+
972
+ # Directory path for annotation files.
973
+ _C.AVA.ANNOTATION_DIR = (
974
+ "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/"
975
+ )
976
+
977
+ # Filenames of training samples list files.
978
+ _C.AVA.TRAIN_LISTS = ["train.csv"]
979
+
980
+ # Filenames of test samples list files.
981
+ _C.AVA.TEST_LISTS = ["val.csv"]
982
+
983
+ # Filenames of box list files for training. Note that we assume files which
984
+ # contains predicted boxes will have a suffix "predicted_boxes" in the
985
+ # filename.
986
+ _C.AVA.TRAIN_GT_BOX_LISTS = ["ava_train_v2.2.csv"]
987
+ _C.AVA.TRAIN_PREDICT_BOX_LISTS = []
988
+
989
+ # Filenames of box list files for test.
990
+ _C.AVA.TEST_PREDICT_BOX_LISTS = ["ava_val_predicted_boxes.csv"]
991
+
992
+ # This option controls the score threshold for the predicted boxes to use.
993
+ _C.AVA.DETECTION_SCORE_THRESH = 0.9
994
+
995
+ # If use BGR as the format of input frames.
996
+ _C.AVA.BGR = False
997
+
998
+ # Training augmentation parameters
999
+ # Whether to use color augmentation method.
1000
+ _C.AVA.TRAIN_USE_COLOR_AUGMENTATION = False
1001
+
1002
+ # Whether to only use PCA jitter augmentation when using color augmentation
1003
+ # method (otherwise combine with color jitter method).
1004
+ _C.AVA.TRAIN_PCA_JITTER_ONLY = True
1005
+
1006
+ # Whether to do horizontal flipping during test.
1007
+ _C.AVA.TEST_FORCE_FLIP = False
1008
+
1009
+ # Whether to use full test set for validation split.
1010
+ _C.AVA.FULL_TEST_ON_VAL = False
1011
+
1012
+ # The name of the file to the ava label map.
1013
+ _C.AVA.LABEL_MAP_FILE = "ava_action_list_v2.2_for_activitynet_2019.pbtxt"
1014
+
1015
+ # The name of the file to the ava exclusion.
1016
+ _C.AVA.EXCLUSION_FILE = "ava_val_excluded_timestamps_v2.2.csv"
1017
+
1018
+ # The name of the file to the ava groundtruth.
1019
+ _C.AVA.GROUNDTRUTH_FILE = "ava_val_v2.2.csv"
1020
+
1021
+ # Backend to process image, includes `pytorch` and `cv2`.
1022
+ _C.AVA.IMG_PROC_BACKEND = "cv2"
1023
+
1024
+ # ---------------------------------------------------------------------------- #
1025
+ # Multigrid training options
1026
+ # See https://arxiv.org/abs/1912.00998 for details about multigrid training.
1027
+ # ---------------------------------------------------------------------------- #
1028
+ _C.MULTIGRID = CfgNode()
1029
+
1030
+ # Multigrid training allows us to train for more epochs with fewer iterations.
1031
+ # This hyperparameter specifies how many times more epochs to train.
1032
+ # The default setting in paper trains for 1.5x more epochs than baseline.
1033
+ _C.MULTIGRID.EPOCH_FACTOR = 1.5
1034
+
1035
+ # Enable short cycles.
1036
+ _C.MULTIGRID.SHORT_CYCLE = False
1037
+ # Short cycle additional spatial dimensions relative to the default crop size.
1038
+ _C.MULTIGRID.SHORT_CYCLE_FACTORS = [0.5, 0.5**0.5]
1039
+
1040
+ _C.MULTIGRID.LONG_CYCLE = False
1041
+ # (Temporal, Spatial) dimensions relative to the default shape.
1042
+ _C.MULTIGRID.LONG_CYCLE_FACTORS = [
1043
+ (0.25, 0.5**0.5),
1044
+ (0.5, 0.5**0.5),
1045
+ (0.5, 1),
1046
+ (1, 1),
1047
+ ]
1048
+
1049
+ # While a standard BN computes stats across all examples in a GPU,
1050
+ # for multigrid training we fix the number of clips to compute BN stats on.
1051
+ # See https://arxiv.org/abs/1912.00998 for details.
1052
+ _C.MULTIGRID.BN_BASE_SIZE = 8
1053
+
1054
+ # Multigrid training epochs are not proportional to actual training time or
1055
+ # computations, so _C.TRAIN.EVAL_PERIOD leads to too frequent or rare
1056
+ # evaluation. We use a multigrid-specific rule to determine when to evaluate:
1057
+ # This hyperparameter defines how many times to evaluate a model per long
1058
+ # cycle shape.
1059
+ _C.MULTIGRID.EVAL_FREQ = 3
1060
+
1061
+ # No need to specify; Set automatically and used as global variables.
1062
+ _C.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = 0
1063
+ _C.MULTIGRID.DEFAULT_B = 0
1064
+ _C.MULTIGRID.DEFAULT_T = 0
1065
+ _C.MULTIGRID.DEFAULT_S = 0
1066
+
1067
+ # -----------------------------------------------------------------------------
+ # Tensorboard Visualization Options
+ # -----------------------------------------------------------------------------
+ _C.TENSORBOARD = CfgNode()
+
+ # Log to summary writer; this will automatically
+ # log loss, lr and metrics during train/eval.
+ _C.TENSORBOARD.ENABLE = False
+ # Provide path to prediction results for visualization.
+ # This is a pickle file of [prediction_tensor, label_tensor].
+ _C.TENSORBOARD.PREDICTIONS_PATH = ""
+ # Path to directory for tensorboard logs.
+ # Defaults to cfg.OUTPUT_DIR/runs-{cfg.TRAIN.DATASET}.
+ _C.TENSORBOARD.LOG_DIR = ""
+ # Path to a json file providing class_name - id mapping
+ # in the format {"class_name1": id1, "class_name2": id2, ...}.
+ # This file must be provided to enable plotting the confusion matrix
+ # by a subset or parent categories.
+ _C.TENSORBOARD.CLASS_NAMES_PATH = ""
+
+ # Path to a json file for categories -> classes mapping
+ # in the format {"parent_class": ["child_class1", "child_class2", ...], ...}.
+ _C.TENSORBOARD.CATEGORIES_PATH = ""
+
+ # Config for confusion matrix visualization.
+ _C.TENSORBOARD.CONFUSION_MATRIX = CfgNode()
+ # Visualize confusion matrix.
+ _C.TENSORBOARD.CONFUSION_MATRIX.ENABLE = False
+ # Figure size of the confusion matrices plotted.
+ _C.TENSORBOARD.CONFUSION_MATRIX.FIGSIZE = [8, 8]
+ # Path to a subset of categories to visualize.
+ # File contains class names separated by newline characters.
+ _C.TENSORBOARD.CONFUSION_MATRIX.SUBSET_PATH = ""
+
+ # Config for histogram visualization.
+ _C.TENSORBOARD.HISTOGRAM = CfgNode()
+ # Visualize histograms.
+ _C.TENSORBOARD.HISTOGRAM.ENABLE = False
+ # Path to a subset of classes to plot histograms for.
+ # Class names must be separated by newline characters.
+ _C.TENSORBOARD.HISTOGRAM.SUBSET_PATH = ""
+ # Visualize the top-k most predicted classes on histograms for each
+ # chosen true label.
+ _C.TENSORBOARD.HISTOGRAM.TOPK = 10
+ # Figure size of the histograms plotted.
+ _C.TENSORBOARD.HISTOGRAM.FIGSIZE = [8, 8]
+
+ # Config for visualizing layers' weights and activations.
+ # _C.TENSORBOARD.ENABLE must be True.
+ _C.TENSORBOARD.MODEL_VIS = CfgNode()
+
+ # If False, skip model visualization.
+ _C.TENSORBOARD.MODEL_VIS.ENABLE = False
+
+ # If False, skip visualizing model weights.
+ _C.TENSORBOARD.MODEL_VIS.MODEL_WEIGHTS = False
+
+ # If False, skip visualizing model activations.
+ _C.TENSORBOARD.MODEL_VIS.ACTIVATIONS = False
+
+ # If False, skip visualizing input videos.
+ _C.TENSORBOARD.MODEL_VIS.INPUT_VIDEO = False
+
+
+ # List of strings containing data about layer names and their indexing to
+ # visualize weights and activations for. The indexing is meant for
+ # choosing a subset of the activations output by a layer for visualization.
+ # If indexing is not specified, visualize all activations output by the layer.
+ # For each string, layer name and indexing are separated by whitespace.
+ # e.g.: [layer1 1,2;1,2, layer2, layer3 150,151;3,4]; this means for each array `arr`
+ # along the batch dimension in `layer1`, we take arr[[1, 2], [1, 2]].
+ _C.TENSORBOARD.MODEL_VIS.LAYER_LIST = []
+ # Top-k predictions to plot on videos.
+ _C.TENSORBOARD.MODEL_VIS.TOPK_PREDS = 1
+ # Colormap for text box and bounding box colors.
+ _C.TENSORBOARD.MODEL_VIS.COLORMAP = "Pastel2"
+ # Config for visualizing video inputs with Grad-CAM.
+ # _C.TENSORBOARD.ENABLE must be True.
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM = CfgNode()
+ # Whether to run visualization using the Grad-CAM technique.
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE = True
+ # CNN layers to use for Grad-CAM. The number of layers must be equal to
+ # the number of pathway(s).
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST = []
+ # If True, visualize Grad-CAM using the true label for each instance.
+ # If False, use the highest predicted class.
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.USE_TRUE_LABEL = False
+ # Colormap for text box and bounding box colors.
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.COLORMAP = "viridis"
+
+ # Config for wrong prediction visualization.
+ # _C.TENSORBOARD.ENABLE must be True.
+ _C.TENSORBOARD.WRONG_PRED_VIS = CfgNode()
+ _C.TENSORBOARD.WRONG_PRED_VIS.ENABLE = False
+ # Folder tag to organize model eval videos under.
+ _C.TENSORBOARD.WRONG_PRED_VIS.TAG = "Incorrectly classified videos."
+ # Subset of labels to visualize. Only wrong predictions with true labels
+ # within this subset are visualized.
+ _C.TENSORBOARD.WRONG_PRED_VIS.SUBSET_PATH = ""
+
+
+ # ---------------------------------------------------------------------------- #
+ # Demo options
+ # ---------------------------------------------------------------------------- #
+ _C.DEMO = CfgNode()
+
+ # Run model in DEMO mode.
+ _C.DEMO.ENABLE = False
+
+ # Path to a json file providing class_name - id mapping
+ # in the format {"class_name1": id1, "class_name2": id2, ...}.
+ _C.DEMO.LABEL_FILE_PATH = ""
+
+ # Specify a camera device as input. This will be prioritized
+ # over the input video if set.
+ # If -1, use the input video instead.
+ _C.DEMO.WEBCAM = -1
+
+ # Path to input video for demo.
+ _C.DEMO.INPUT_VIDEO = ""
+ # Custom width for reading input video data.
+ _C.DEMO.DISPLAY_WIDTH = 0
+ # Custom height for reading input video data.
+ _C.DEMO.DISPLAY_HEIGHT = 0
+ # Path to Detectron2 object detection model configuration,
+ # only used for detection tasks.
+ _C.DEMO.DETECTRON2_CFG = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
+ # Path to Detectron2 object detection model pre-trained weights.
+ _C.DEMO.DETECTRON2_WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl"
+ # Threshold for choosing predicted bounding boxes by Detectron2.
+ _C.DEMO.DETECTRON2_THRESH = 0.9
+ # Number of overlapping frames between 2 consecutive clips.
+ # Increase this number for more frequent action predictions.
+ # The number of overlapping frames cannot be larger than
+ # half of the sequence length `cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE`.
+ _C.DEMO.BUFFER_SIZE = 0
+ # If specified, the visualized outputs will be written to a video file at
+ # this path. Otherwise, the visualized outputs will be displayed in a window.
+ _C.DEMO.OUTPUT_FILE = ""
+ # Frames per second rate for writing to the output video file.
+ # If not set (-1), use the fps rate from the input file.
+ _C.DEMO.OUTPUT_FPS = -1
+ # Input format from demo video reader ("RGB" or "BGR").
+ _C.DEMO.INPUT_FORMAT = "BGR"
+ # Draw visualization frames in [keyframe_idx - CLIP_VIS_SIZE, keyframe_idx + CLIP_VIS_SIZE] inclusively.
+ _C.DEMO.CLIP_VIS_SIZE = 10
+ # Number of processes to run the video visualizer.
+ _C.DEMO.NUM_VIS_INSTANCES = 2
+
+ # Path to pre-computed predicted boxes.
+ _C.DEMO.PREDS_BOXES = ""
+ # Whether to run with a multi-threaded video reader.
+ _C.DEMO.THREAD_ENABLE = False
+ # Take one clip for every `DEMO.NUM_CLIPS_SKIP` + 1 for prediction and visualization.
+ # This is used to speed up the demo by reducing the prediction/visualization frequency.
+ # If -1, take the most recently read clip for visualization. This mode is only supported
+ # if `DEMO.THREAD_ENABLE` is set to True.
+ _C.DEMO.NUM_CLIPS_SKIP = 0
+ # Path to ground-truth boxes and labels (optional).
+ _C.DEMO.GT_BOXES = ""
+ # The starting second of the video w.r.t. the bounding boxes file.
+ _C.DEMO.STARTING_SECOND = 900
+ # Frames per second of the input video/folder of images.
+ _C.DEMO.FPS = 30
+ # Visualize with top-k predictions or predictions above certain threshold(s).
+ # Options: {"thres", "top-k"}
+ _C.DEMO.VIS_MODE = "thres"
+ # Threshold for common class names.
+ _C.DEMO.COMMON_CLASS_THRES = 0.7
+ # Threshold for uncommon class names. This will not be
+ # used if `_C.DEMO.COMMON_CLASS_NAMES` is empty.
+ _C.DEMO.UNCOMMON_CLASS_THRES = 0.3
+ # This is chosen based on the distribution of examples in
+ # each class of the AVA dataset.
+ _C.DEMO.COMMON_CLASS_NAMES = [
+     "watch (a person)",
+     "talk to (e.g., self, a person, a group)",
+     "listen to (a person)",
+     "touch (an object)",
+     "carry/hold (an object)",
+     "walk",
+     "sit",
+     "lie/sleep",
+     "bend/bow (at the waist)",
+ ]
+ # Slow-motion rate for the visualization. The visualized portions of the
+ # video will be played `_C.DEMO.SLOWMO` times slower than usual speed.
+ _C.DEMO.SLOWMO = 1
+
+
+ def assert_and_infer_cfg(cfg):
+     # BN assertions.
+     if cfg.BN.USE_PRECISE_STATS:
+         assert cfg.BN.NUM_BATCHES_PRECISE >= 0
+     # TRAIN assertions.
+     assert cfg.TRAIN.CHECKPOINT_TYPE in ["pytorch", "caffe2"]
+     assert cfg.NUM_GPUS == 0 or cfg.TRAIN.BATCH_SIZE % cfg.NUM_GPUS == 0
+
+     # TEST assertions.
+     assert cfg.TEST.CHECKPOINT_TYPE in ["pytorch", "caffe2"]
+     assert cfg.NUM_GPUS == 0 or cfg.TEST.BATCH_SIZE % cfg.NUM_GPUS == 0
+
+     # RESNET assertions.
+     assert cfg.RESNET.NUM_GROUPS > 0
+     assert cfg.RESNET.WIDTH_PER_GROUP > 0
+     assert cfg.RESNET.WIDTH_PER_GROUP % cfg.RESNET.NUM_GROUPS == 0
+
+     # Execute LR scaling by num_shards.
+     if cfg.SOLVER.BASE_LR_SCALE_NUM_SHARDS:
+         cfg.SOLVER.BASE_LR *= cfg.NUM_SHARDS
+         cfg.SOLVER.WARMUP_START_LR *= cfg.NUM_SHARDS
+         cfg.SOLVER.COSINE_END_LR *= cfg.NUM_SHARDS
+
+     # General assertions.
+     assert cfg.SHARD_ID < cfg.NUM_SHARDS
+     return cfg
+
+
+ def get_cfg():
+     return _C.clone()
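+
+
+ # A minimal usage sketch (not part of the original file): build the default
+ # config, optionally merge a YAML override (the path below is hypothetical),
+ # then validate and infer derived settings.
+ #
+ #   cfg = get_cfg()
+ #   cfg.merge_from_file("config.yml")
+ #   cfg = assert_and_infer_cfg(cfg)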
helpers/head.py ADDED
@@ -0,0 +1,146 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+ """ResNe(X)t Head helper."""
+
+ import torch.nn as nn
+
+
+ class X3DHead(nn.Module):
+     """
+     X3D head.
+     This layer performs a fully-connected projection during training, when the
+     input size is 1x1x1. It performs a convolutional projection during testing
+     when the input size is larger than 1x1x1. If the inputs are from multiple
+     different pathways, the inputs will be concatenated after pooling.
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_inner,
+         dim_out,
+         num_classes,
+         pool_size,
+         dropout_rate=0.0,
+         act_func="softmax",
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         norm_module=nn.BatchNorm3d,
+         bn_lin5_on=False,
+     ):
+         """
+         The `__init__` method of any subclass should also contain these
+         arguments.
+         X3DHead takes a 5-dim feature tensor (BxCxTxHxW) as input.
+
+         Args:
+             dim_in (int): the channel dimension C of the input.
+             num_classes (int): the channel dimension of the output.
+             pool_size (list): a single entry list of kernel size for
+                 spatiotemporal pooling over the TxHxW dimensions.
+             dropout_rate (float): dropout rate. If equal to 0.0, perform no
+                 dropout.
+             act_func (string): activation function to use. 'softmax': applies
+                 softmax on the output. 'sigmoid': applies sigmoid on the output.
+             inplace_relu (bool): if True, calculate the relu on the original
+                 input without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+             bn_lin5_on (bool): if True, perform normalization on the features
+                 before the classifier.
+         """
+         super(X3DHead, self).__init__()
+         self.pool_size = pool_size
+         self.dropout_rate = dropout_rate
+         self.num_classes = num_classes
+         self.act_func = act_func
+         self.eps = eps
+         self.bn_mmt = bn_mmt
+         self.inplace_relu = inplace_relu
+         self.bn_lin5_on = bn_lin5_on
+         self._construct_head(dim_in, dim_inner, dim_out, norm_module)
+
+     def _construct_head(self, dim_in, dim_inner, dim_out, norm_module):
+         self.conv_5 = nn.Conv3d(
+             dim_in,
+             dim_inner,
+             kernel_size=(1, 1, 1),
+             stride=(1, 1, 1),
+             padding=(0, 0, 0),
+             bias=False,
+         )
+         self.conv_5_bn = norm_module(
+             num_features=dim_inner, eps=self.eps, momentum=self.bn_mmt
+         )
+         self.conv_5_relu = nn.ReLU(inplace=self.inplace_relu)
+
+         if self.pool_size is None:
+             self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+         else:
+             self.avg_pool = nn.AvgPool3d(self.pool_size, stride=1)
+
+         self.lin_5 = nn.Conv3d(
+             dim_inner,
+             dim_out,
+             kernel_size=(1, 1, 1),
+             stride=(1, 1, 1),
+             padding=(0, 0, 0),
+             bias=False,
+         )
+         if self.bn_lin5_on:
+             self.lin_5_bn = norm_module(
+                 num_features=dim_out, eps=self.eps, momentum=self.bn_mmt
+             )
+         self.lin_5_relu = nn.ReLU(inplace=self.inplace_relu)
+
+         if self.dropout_rate > 0.0:
+             self.dropout = nn.Dropout(self.dropout_rate)
+         # Perform FC in a fully convolutional manner. The FC layer will be
+         # initialized with a different std compared to convolutional layers.
+         self.projection = nn.Linear(dim_out, self.num_classes, bias=True)
+
+         # Softmax for evaluation and testing.
+         if self.act_func == "softmax":
+             self.act = nn.Softmax(dim=4)
+         elif self.act_func == "sigmoid":
+             self.act = nn.Sigmoid()
+         else:
+             raise NotImplementedError(
+                 "{} is not supported as an activation function.".format(
+                     self.act_func
+                 )
+             )
+
+     def forward(self, inputs):
+         # In its current design the X3D head is only usable for a single
+         # pathway input.
+         assert len(inputs) == 1, "Input tensor does not contain 1 pathway"
+         x = self.conv_5(inputs[0])
+         x = self.conv_5_bn(x)
+         x = self.conv_5_relu(x)
+         x = self.avg_pool(x)
+
+         x = self.lin_5(x)
+         if self.bn_lin5_on:
+             x = self.lin_5_bn(x)
+         x = self.lin_5_relu(x)
+
+         # (N, C, T, H, W) -> (N, T, H, W, C).
+         x = x.permute((0, 2, 3, 4, 1))
+         # Perform dropout.
+         if hasattr(self, "dropout"):
+             x = self.dropout(x)
+         x = self.projection(x)
+
+         # Perform fully convolutional inference.
+         if not self.training:
+             x = self.act(x)
+             x = x.mean([1, 2, 3])
+
+         x = x.view(x.shape[0], -1)
+         return x
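+
+
+ if __name__ == "__main__":
+     # A minimal smoke test (hypothetical X3D-S-like dimensions, not part of
+     # the original file): the head maps BxCxTxHxW features to class scores.
+     import torch
+
+     head = X3DHead(
+         dim_in=192, dim_inner=432, dim_out=2048, num_classes=400, pool_size=None
+     )
+     head.eval()  # in eval mode, softmax is applied and T/H/W are averaged
+     clip_features = torch.randn(2, 192, 16, 7, 7)
+     print(head([clip_features]).shape)  # -> torch.Size([2, 400])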
helpers/norm.py ADDED
@@ -0,0 +1,110 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+ """BatchNorm (BN) utility functions and custom batch-size BN implementations."""
+
+ from functools import partial
+
+ import torch
+ import torch.nn as nn
+
+ from pytorchvideo.layers.batch_norm import NaiveSyncBatchNorm3d
+
+
+ def get_norm(cfg):
+     """
+     Args:
+         cfg (CfgNode): model building configs, details are in the comments of
+             the config file.
+     Returns:
+         nn.Module: the normalization layer.
+     """
+     if cfg.BN.NORM_TYPE in {"batchnorm", "sync_batchnorm_apex"}:
+         return nn.BatchNorm3d
+     elif cfg.BN.NORM_TYPE == "sub_batchnorm":
+         return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS)
+     elif cfg.BN.NORM_TYPE == "sync_batchnorm":
+         return partial(
+             NaiveSyncBatchNorm3d,
+             num_sync_devices=cfg.BN.NUM_SYNC_DEVICES,
+             global_sync=cfg.BN.GLOBAL_SYNC,
+         )
+     else:
+         raise NotImplementedError(
+             "Norm type {} is not supported".format(cfg.BN.NORM_TYPE)
+         )
+
+
+ class SubBatchNorm3d(nn.Module):
+     """
+     The standard BN layer computes stats across all examples in a GPU. In some
+     cases it is desirable to compute stats across only a subset of examples
+     (e.g., in multigrid training https://arxiv.org/abs/1912.00998).
+     SubBatchNorm3d splits the batch dimension into N splits and runs BN on
+     each of them separately, so that the stats are computed on each subset of
+     examples (1/N of the batch) independently. During evaluation, it
+     aggregates the stats from all splits into one BN.
+     """
+
+     def __init__(self, num_splits, **args):
+         """
+         Args:
+             num_splits (int): number of splits.
+             args (dict): other keyword arguments for nn.BatchNorm3d.
+         """
+         super(SubBatchNorm3d, self).__init__()
+         self.num_splits = num_splits
+         num_features = args["num_features"]
+         # Keep only one set of weight and bias.
+         if args.get("affine", True):
+             self.affine = True
+             args["affine"] = False
+             self.weight = torch.nn.Parameter(torch.ones(num_features))
+             self.bias = torch.nn.Parameter(torch.zeros(num_features))
+         else:
+             self.affine = False
+         self.bn = nn.BatchNorm3d(**args)
+         args["num_features"] = num_features * num_splits
+         self.split_bn = nn.BatchNorm3d(**args)
+
+     def _get_aggregated_mean_std(self, means, stds, n):
+         """
+         Calculate the aggregated mean and stds.
+         Args:
+             means (tensor): mean values.
+             stds (tensor): standard deviations.
+             n (int): number of sets of means and stds.
+         """
+         mean = means.view(n, -1).sum(0) / n
+         std = (
+             stds.view(n, -1).sum(0) / n
+             + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n
+         )
+         return mean.detach(), std.detach()
+
+     def aggregate_stats(self):
+         """
+         Synchronize running_mean and running_var. Call this before eval.
+         """
+         if self.split_bn.track_running_stats:
+             (
+                 self.bn.running_mean.data,
+                 self.bn.running_var.data,
+             ) = self._get_aggregated_mean_std(
+                 self.split_bn.running_mean,
+                 self.split_bn.running_var,
+                 self.num_splits,
+             )
+
+     def forward(self, x):
+         if self.training:
+             n, c, t, h, w = x.shape
+             x = x.view(n // self.num_splits, c * self.num_splits, t, h, w)
+             x = self.split_bn(x)
+             x = x.view(n, c, t, h, w)
+         else:
+             x = self.bn(x)
+         if self.affine:
+             x = x * self.weight.view((-1, 1, 1, 1))
+             x = x + self.bias.view((-1, 1, 1, 1))
+         return x
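+
+
+ if __name__ == "__main__":
+     # A minimal usage sketch (hypothetical shapes, not part of the original
+     # file): train-time stats are computed per split of the batch; fold them
+     # into the single eval-time BN before switching to eval.
+     sub_bn = SubBatchNorm3d(num_splits=2, num_features=8)
+     x = torch.randn(4, 8, 4, 8, 8)  # batch of 4 -> two splits of 2
+     y = sub_bn(x)
+     sub_bn.aggregate_stats()
+     sub_bn.eval()
+     y = sub_bn(x)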
helpers/resnet.py ADDED
@@ -0,0 +1,927 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+ """Video models."""
+
+ import torch
+ import torch.nn as nn
+ from pytorchvideo.layers.swish import Swish
+
+
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+     """
+     Stochastic Depth per sample: randomly zero entire examples of the
+     residual branch, rescaling the survivors by 1 / keep_prob.
+     """
+     if drop_prob == 0.0 or not training:
+         return x
+     keep_prob = 1 - drop_prob
+     shape = (x.shape[0],) + (1,) * (
+         x.ndim - 1
+     )  # work with diff dim tensors, not just 2D ConvNets
+     mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+     mask.floor_()  # binarize
+     output = x.div(keep_prob) * mask
+     return output
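+
+
+ # A minimal sketch (hypothetical values, not part of the original file):
+ # during training, roughly drop_prob of the samples have their residual
+ # branch zeroed, and the rest are scaled by 1 / keep_prob so the
+ # expectation is unchanged.
+ #
+ #   out = drop_path(torch.randn(8, 24, 4, 16, 16), drop_prob=0.2, training=True)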
+
+
+ class Nonlocal(nn.Module):
+     """
+     Builds Non-local Neural Networks as a generic family of building
+     blocks for capturing long-range dependencies. A Non-local Network
+     computes the response at a position as a weighted sum of the
+     features at all positions. This building block can be plugged into
+     many computer vision architectures.
+     More details in the paper: https://arxiv.org/pdf/1711.07971.pdf
+     """
+
+     def __init__(
+         self,
+         dim,
+         dim_inner,
+         pool_size=None,
+         instantiation="softmax",
+         zero_init_final_conv=False,
+         zero_init_final_norm=True,
+         norm_eps=1e-5,
+         norm_momentum=0.1,
+         norm_module=nn.BatchNorm3d,
+     ):
+         """
+         Args:
+             dim (int): number of dimensions for the input.
+             dim_inner (int): number of dimensions inside of the Non-local block.
+             pool_size (list): the kernel size of spatiotemporal pooling:
+                 temporal pool kernel size, height pool kernel size, width
+                 pool kernel size in order. By default pool_size is None,
+                 in which case no pooling is used.
+             instantiation (string): supports two different instantiation methods:
+                 "dot_product": normalizing correlation matrix with L2.
+                 "softmax": normalizing correlation matrix with Softmax.
+             zero_init_final_conv (bool): if True, zero initialize the final
+                 convolution of the Non-local block.
+             zero_init_final_norm (bool):
+                 if True, zero initialize the final batch norm of the Non-local
+                 block.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+         """
+         super(Nonlocal, self).__init__()
+         self.dim = dim
+         self.dim_inner = dim_inner
+         self.pool_size = pool_size
+         self.instantiation = instantiation
+         self.use_pool = (
+             False if pool_size is None else any((size > 1 for size in pool_size))
+         )
+         self.norm_eps = norm_eps
+         self.norm_momentum = norm_momentum
+         self._construct_nonlocal(
+             zero_init_final_conv, zero_init_final_norm, norm_module
+         )
+
+     def _construct_nonlocal(
+         self, zero_init_final_conv, zero_init_final_norm, norm_module
+     ):
+         # Three convolution heads: theta, phi, and g.
+         self.conv_theta = nn.Conv3d(
+             self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
+         )
+         self.conv_phi = nn.Conv3d(
+             self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
+         )
+         self.conv_g = nn.Conv3d(
+             self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
+         )
+
+         # Final convolution output.
+         self.conv_out = nn.Conv3d(
+             self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0
+         )
+         # Zero initializing the final convolution output.
+         self.conv_out.zero_init = zero_init_final_conv
+
+         # TODO: change the name to `norm`
+         self.bn = norm_module(
+             num_features=self.dim,
+             eps=self.norm_eps,
+             momentum=self.norm_momentum,
+         )
+         # Zero initializing the final bn.
+         self.bn.transform_final_bn = zero_init_final_norm
+
+         # Optionally add spatiotemporal pooling.
+         if self.use_pool:
+             self.pool = nn.MaxPool3d(
+                 kernel_size=self.pool_size,
+                 stride=self.pool_size,
+                 padding=[0, 0, 0],
+             )
+
+     def forward(self, x):
+         x_identity = x
+         N, C, T, H, W = x.size()
+
+         theta = self.conv_theta(x)
+
+         # Perform temporal-spatial pooling to reduce the computation.
+         if self.use_pool:
+             x = self.pool(x)
+
+         phi = self.conv_phi(x)
+         g = self.conv_g(x)
+
+         theta = theta.view(N, self.dim_inner, -1)
+         phi = phi.view(N, self.dim_inner, -1)
+         g = g.view(N, self.dim_inner, -1)
+
+         # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW).
+         theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi))
+         # In the original Non-local paper, there are two main ways to
+         # normalize the affinity tensor:
+         # 1) Softmax normalization (norm on exp).
+         # 2) dot_product normalization.
+         if self.instantiation == "softmax":
+             # Normalizing the affinity tensor theta_phi before softmax.
+             theta_phi = theta_phi * (self.dim_inner**-0.5)
+             theta_phi = nn.functional.softmax(theta_phi, dim=2)
+         elif self.instantiation == "dot_product":
+             spatial_temporal_dim = theta_phi.shape[2]
+             theta_phi = theta_phi / spatial_temporal_dim
+         else:
+             raise NotImplementedError(
+                 "Unknown norm type {}".format(self.instantiation)
+             )
+
+         # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW).
+         theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g))
+
+         # (N, C, TxHxW) => (N, C, T, H, W).
+         theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W)
+
+         p = self.conv_out(theta_phi_g)
+         p = self.bn(p)
+         return x_identity + p
+
+
+ class SE(nn.Module):
+     """Squeeze-and-Excitation (SE) block w/ Swish: AvgPool, FC, Swish, FC, Sigmoid."""
+
+     def _round_width(self, width, multiplier, min_width=8, divisor=8):
+         """
+         Round width of filters based on width multiplier.
+         Args:
+             width (int): the channel dimensions of the input.
+             multiplier (float): the multiplication factor.
+             min_width (int): the minimum width after multiplication.
+             divisor (int): the new width should be divisible by divisor.
+         """
+         if not multiplier:
+             return width
+
+         width *= multiplier
+         min_width = min_width or divisor
+         width_out = max(min_width, int(width + divisor / 2) // divisor * divisor)
+         if width_out < 0.9 * width:
+             width_out += divisor
+         return int(width_out)
+
+     def __init__(self, dim_in, ratio, relu_act=True):
+         """
+         Args:
+             dim_in (int): the channel dimensions of the input.
+             ratio (float): the channel reduction ratio for squeeze.
+             relu_act (bool): whether to use ReLU activation instead
+                 of Swish (default).
+         """
+         super(SE, self).__init__()
+         self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+         dim_fc = self._round_width(dim_in, ratio)
+         self.fc1 = nn.Conv3d(dim_in, dim_fc, 1, bias=True)
+         self.fc1_act = nn.ReLU() if relu_act else Swish()
+         self.fc2 = nn.Conv3d(dim_fc, dim_in, 1, bias=True)
+
+         self.fc2_sig = nn.Sigmoid()
+
+     def forward(self, x):
+         x_in = x
+         # Apply the squeeze/excitation branch in module registration order:
+         # avg_pool -> fc1 -> fc1_act -> fc2 -> fc2_sig.
+         for module in self.children():
+             x = module(x)
+         return x_in * x
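+
+
+ # A minimal sketch (hypothetical shapes, not part of the original file):
+ # SE rescales each channel of a BxCxTxHxW feature map by a learned gate.
+ #
+ #   se = SE(dim_in=64, ratio=0.0625)
+ #   out = se(torch.randn(2, 64, 4, 8, 8))  # same shape as the input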
+
+
+ def get_trans_func(name):
+     """
+     Retrieves the transformation module by name.
+     """
+     trans_funcs = {
+         "bottleneck_transform": BottleneckTransform,
+         "basic_transform": BasicTransform,
+         "x3d_transform": X3DTransform,
+     }
+     assert (
+         name in trans_funcs.keys()
+     ), "Transformation function '{}' not supported".format(name)
+     return trans_funcs[name]
+
+
+ class BasicTransform(nn.Module):
+     """
+     Basic transformation: Tx3x3, 1x3x3, where T is the size of the temporal
+     kernel.
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         temp_kernel_size,
+         stride,
+         dim_inner=None,
+         num_groups=1,
+         stride_1x1=None,
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         dilation=1,
+         norm_module=nn.BatchNorm3d,
+         block_idx=0,
+     ):
+         """
+         Args:
+             dim_in (int): the channel dimensions of the input.
+             dim_out (int): the channel dimension of the output.
+             temp_kernel_size (int): the temporal kernel size of the first
+                 convolution in the basic block.
+             stride (int): the stride of the bottleneck.
+             dim_inner (None): the inner dimension is not used in
+                 BasicTransform.
+             num_groups (int): number of groups for the convolution. The
+                 number of groups is always 1 for BasicTransform.
+             stride_1x1 (None): stride_1x1 is not used in BasicTransform.
+             inplace_relu (bool): if True, calculate the relu on the original
+                 input without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+         """
+         super(BasicTransform, self).__init__()
+         self.temp_kernel_size = temp_kernel_size
+         self._inplace_relu = inplace_relu
+         self._eps = eps
+         self._bn_mmt = bn_mmt
+         self._construct(dim_in, dim_out, stride, dilation, norm_module)
+
+     def _construct(self, dim_in, dim_out, stride, dilation, norm_module):
+         # Tx3x3, BN, ReLU.
+         self.a = nn.Conv3d(
+             dim_in,
+             dim_out,
+             kernel_size=[self.temp_kernel_size, 3, 3],
+             stride=[1, stride, stride],
+             padding=[int(self.temp_kernel_size // 2), 1, 1],
+             bias=False,
+         )
+         self.a_bn = norm_module(
+             num_features=dim_out, eps=self._eps, momentum=self._bn_mmt
+         )
+         self.a_relu = nn.ReLU(inplace=self._inplace_relu)
+         # 1x3x3, BN.
+         self.b = nn.Conv3d(
+             dim_out,
+             dim_out,
+             kernel_size=[1, 3, 3],
+             stride=[1, 1, 1],
+             padding=[0, dilation, dilation],
+             dilation=[1, dilation, dilation],
+             bias=False,
+         )
+
+         self.b.final_conv = True
+
+         self.b_bn = norm_module(
+             num_features=dim_out, eps=self._eps, momentum=self._bn_mmt
+         )
+
+         self.b_bn.transform_final_bn = True
+
+     def forward(self, x):
+         x = self.a(x)
+         x = self.a_bn(x)
+         x = self.a_relu(x)
+
+         x = self.b(x)
+         x = self.b_bn(x)
+         return x
+
+
+ class X3DTransform(nn.Module):
+     """
+     X3D transformation: 1x1x1, Tx3x3 (channelwise, num_groups=dim_in), 1x1x1,
+     augmented with (optional) SE (squeeze-excitation) on the Tx3x3 output.
+     T is the temporal kernel size (defaulting to 3).
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         temp_kernel_size,
+         stride,
+         dim_inner,
+         num_groups,
+         stride_1x1=False,
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         dilation=1,
+         norm_module=nn.BatchNorm3d,
+         se_ratio=0.0625,
+         swish_inner=True,
+         block_idx=0,
+     ):
+         """
+         Args:
+             dim_in (int): the channel dimensions of the input.
+             dim_out (int): the channel dimension of the output.
+             temp_kernel_size (int): the temporal kernel size of the middle
+                 convolution in the bottleneck.
+             stride (int): the stride of the bottleneck.
+             dim_inner (int): the inner dimension of the block.
+             num_groups (int): number of groups for the convolution. num_groups=1
+                 is for standard ResNet-like networks, and num_groups>1 is for
+                 ResNeXt-like networks.
+             stride_1x1 (bool): if True, apply stride to the 1x1 conv, otherwise
+                 apply stride to the 3x3 conv.
+             inplace_relu (bool): if True, calculate the relu on the original
+                 input without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             dilation (int): size of dilation.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+             se_ratio (float): if > 0, apply SE to the Tx3x3 conv, with the SE
+                 channel dimensionality being se_ratio times the Tx3x3 conv dim.
+             swish_inner (bool): if True, apply swish to the Tx3x3 conv, otherwise
+                 apply ReLU to the Tx3x3 conv.
+         """
+         super(X3DTransform, self).__init__()
+         self.temp_kernel_size = temp_kernel_size
+         self._inplace_relu = inplace_relu
+         self._eps = eps
+         self._bn_mmt = bn_mmt
+         self._se_ratio = se_ratio
+         self._swish_inner = swish_inner
+         self._stride_1x1 = stride_1x1
+         self._block_idx = block_idx
+         self._construct(
+             dim_in,
+             dim_out,
+             stride,
+             dim_inner,
+             num_groups,
+             dilation,
+             norm_module,
+         )
+
+     def _construct(
+         self,
+         dim_in,
+         dim_out,
+         stride,
+         dim_inner,
+         num_groups,
+         dilation,
+         norm_module,
+     ):
+         (str1x1, str3x3) = (stride, 1) if self._stride_1x1 else (1, stride)
+
+         # 1x1x1, BN, ReLU.
+         self.a = nn.Conv3d(
+             dim_in,
+             dim_inner,
+             kernel_size=[1, 1, 1],
+             stride=[1, str1x1, str1x1],
+             padding=[0, 0, 0],
+             bias=False,
+         )
+         self.a_bn = norm_module(
+             num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt
+         )
+         self.a_relu = nn.ReLU(inplace=self._inplace_relu)
+
+         # Tx3x3, BN, ReLU.
+         self.b = nn.Conv3d(
+             dim_inner,
+             dim_inner,
+             [self.temp_kernel_size, 3, 3],
+             stride=[1, str3x3, str3x3],
+             padding=[int(self.temp_kernel_size // 2), dilation, dilation],
+             groups=num_groups,
+             bias=False,
+             dilation=[1, dilation, dilation],
+         )
+         self.b_bn = norm_module(
+             num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt
+         )
+
+         # Apply SE attention on every other block (even block indices).
+         # Note that `se` is registered before `b_relu`, so the children()
+         # iteration in forward() runs it right after `b_bn`.
+         use_se = (self._block_idx % 2) == 0
+         if self._se_ratio > 0.0 and use_se:
+             self.se = SE(dim_inner, self._se_ratio)
+
+         if self._swish_inner:
+             self.b_relu = Swish()
+         else:
+             self.b_relu = nn.ReLU(inplace=self._inplace_relu)
+
+         # 1x1x1, BN.
+         self.c = nn.Conv3d(
+             dim_inner,
+             dim_out,
+             kernel_size=[1, 1, 1],
+             stride=[1, 1, 1],
+             padding=[0, 0, 0],
+             bias=False,
+         )
+         self.c_bn = norm_module(
+             num_features=dim_out, eps=self._eps, momentum=self._bn_mmt
+         )
+         self.c_bn.transform_final_bn = True
+
+     def forward(self, x):
+         # Run all child modules in registration order.
+         for block in self.children():
+             x = block(x)
+         return x
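+
+
+ # A minimal sketch (hypothetical X3D-like widths, not part of the original
+ # file): with num_groups == dim_inner, the Tx3x3 conv is depthwise.
+ #
+ #   trans = X3DTransform(dim_in=24, dim_out=24, temp_kernel_size=3, stride=1,
+ #                        dim_inner=54, num_groups=54)
+ #   out = trans(torch.randn(1, 24, 4, 16, 16))  # -> (1, 24, 4, 16, 16)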
+
+
+ class BottleneckTransform(nn.Module):
+     """
+     Bottleneck transformation: Tx1x1, 1x3x3, 1x1x1, where T is the size of
+     the temporal kernel.
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         temp_kernel_size,
+         stride,
+         dim_inner,
+         num_groups,
+         stride_1x1=False,
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         dilation=1,
+         norm_module=nn.BatchNorm3d,
+         block_idx=0,
+     ):
+         """
+         Args:
+             dim_in (int): the channel dimensions of the input.
+             dim_out (int): the channel dimension of the output.
+             temp_kernel_size (int): the temporal kernel size of the first
+                 convolution in the bottleneck.
+             stride (int): the stride of the bottleneck.
+             dim_inner (int): the inner dimension of the block.
+             num_groups (int): number of groups for the convolution. num_groups=1
+                 is for standard ResNet-like networks, and num_groups>1 is for
+                 ResNeXt-like networks.
+             stride_1x1 (bool): if True, apply stride to the 1x1 conv, otherwise
+                 apply stride to the 3x3 conv.
+             inplace_relu (bool): if True, calculate the relu on the original
+                 input without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             dilation (int): size of dilation.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+         """
+         super(BottleneckTransform, self).__init__()
+         self.temp_kernel_size = temp_kernel_size
+         self._inplace_relu = inplace_relu
+         self._eps = eps
+         self._bn_mmt = bn_mmt
+         self._stride_1x1 = stride_1x1
+         self._construct(
+             dim_in,
+             dim_out,
+             stride,
+             dim_inner,
+             num_groups,
+             dilation,
+             norm_module,
+         )
+
+     def _construct(
+         self,
+         dim_in,
+         dim_out,
+         stride,
+         dim_inner,
+         num_groups,
+         dilation,
+         norm_module,
+     ):
+         (str1x1, str3x3) = (stride, 1) if self._stride_1x1 else (1, stride)
+
+         # Tx1x1, BN, ReLU.
+         self.a = nn.Conv3d(
+             dim_in,
+             dim_inner,
+             kernel_size=[self.temp_kernel_size, 1, 1],
+             stride=[1, str1x1, str1x1],
+             padding=[int(self.temp_kernel_size // 2), 0, 0],
+             bias=False,
+         )
+         self.a_bn = norm_module(
+             num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt
+         )
+         self.a_relu = nn.ReLU(inplace=self._inplace_relu)
+
+         # 1x3x3, BN, ReLU.
+         self.b = nn.Conv3d(
+             dim_inner,
+             dim_inner,
+             [1, 3, 3],
+             stride=[1, str3x3, str3x3],
+             padding=[0, dilation, dilation],
+             groups=num_groups,
+             bias=False,
+             dilation=[1, dilation, dilation],
+         )
+         self.b_bn = norm_module(
+             num_features=dim_inner, eps=self._eps, momentum=self._bn_mmt
+         )
+         self.b_relu = nn.ReLU(inplace=self._inplace_relu)
+
+         # 1x1x1, BN.
+         self.c = nn.Conv3d(
+             dim_inner,
+             dim_out,
+             kernel_size=[1, 1, 1],
+             stride=[1, 1, 1],
+             padding=[0, 0, 0],
+             bias=False,
+         )
+         self.c.final_conv = True
+
+         self.c_bn = norm_module(
+             num_features=dim_out, eps=self._eps, momentum=self._bn_mmt
+         )
+         self.c_bn.transform_final_bn = True
+
+     def forward(self, x):
+         # Explicitly forward every layer.
+         # Branch2a.
+         x = self.a(x)
+         x = self.a_bn(x)
+         x = self.a_relu(x)
+
+         # Branch2b.
+         x = self.b(x)
+         x = self.b_bn(x)
+         x = self.b_relu(x)
+
+         # Branch2c.
+         x = self.c(x)
+         x = self.c_bn(x)
+         return x
+
+
+ class ResBlock(nn.Module):
+     """
+     Residual block.
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         temp_kernel_size,
+         stride,
+         trans_func,
+         dim_inner,
+         num_groups=1,
+         stride_1x1=False,
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         dilation=1,
+         norm_module=nn.BatchNorm3d,
+         block_idx=0,
+         drop_connect_rate=0.0,
+     ):
+         """
+         ResBlock class constructs residual blocks. More details can be found in:
+         Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.
+         "Deep residual learning for image recognition."
+         https://arxiv.org/abs/1512.03385
+         Args:
+             dim_in (int): the channel dimensions of the input.
+             dim_out (int): the channel dimension of the output.
+             temp_kernel_size (int): the temporal kernel size of the middle
+                 convolution in the bottleneck.
+             stride (int): the stride of the bottleneck.
+             trans_func (string): transform function to be used to construct the
+                 bottleneck.
+             dim_inner (int): the inner dimension of the block.
+             num_groups (int): number of groups for the convolution. num_groups=1
+                 is for standard ResNet-like networks, and num_groups>1 is for
+                 ResNeXt-like networks.
+             stride_1x1 (bool): if True, apply stride to the 1x1 conv, otherwise
+                 apply stride to the 3x3 conv.
+             inplace_relu (bool): calculate the relu on the original input
+                 without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             dilation (int): size of dilation.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+             drop_connect_rate (float): basic rate at which blocks are dropped,
+                 linearly increases from input to output blocks.
+         """
+         super(ResBlock, self).__init__()
+         self._inplace_relu = inplace_relu
+         self._eps = eps
+         self._bn_mmt = bn_mmt
+         self._drop_connect_rate = drop_connect_rate
+         self._construct(
+             dim_in,
+             dim_out,
+             temp_kernel_size,
+             stride,
+             trans_func,
+             dim_inner,
+             num_groups,
+             stride_1x1,
+             inplace_relu,
+             dilation,
+             norm_module,
+             block_idx,
+         )
+
+     def _construct(
+         self,
+         dim_in,
+         dim_out,
+         temp_kernel_size,
+         stride,
+         trans_func,
+         dim_inner,
+         num_groups,
+         stride_1x1,
+         inplace_relu,
+         dilation,
+         norm_module,
+         block_idx,
+     ):
+         # Use skip connection with projection if dim or res change.
+         if (dim_in != dim_out) or (stride != 1):
+             self.branch1 = nn.Conv3d(
+                 dim_in,
+                 dim_out,
+                 kernel_size=1,
+                 stride=[1, stride, stride],
+                 padding=0,
+                 bias=False,
+                 dilation=1,
+             )
+             self.branch1_bn = norm_module(
+                 num_features=dim_out, eps=self._eps, momentum=self._bn_mmt
+             )
+         self.branch2 = trans_func(
+             dim_in,
+             dim_out,
+             temp_kernel_size,
+             stride,
+             dim_inner,
+             num_groups,
+             stride_1x1=stride_1x1,
+             inplace_relu=inplace_relu,
+             dilation=dilation,
+             norm_module=norm_module,
+             block_idx=block_idx,
+         )
+         self.relu = nn.ReLU(inplace=self._inplace_relu)
+
+     def forward(self, x):
+         f_x = self.branch2(x)
+         if self.training and self._drop_connect_rate > 0.0:
+             # Pass training explicitly; drop_path defaults to training=False
+             # and would otherwise be a no-op.
+             f_x = drop_path(f_x, self._drop_connect_rate, training=self.training)
+         if hasattr(self, "branch1"):
+             x = self.branch1_bn(self.branch1(x)) + f_x
+         else:
+             x = x + f_x
+         x = self.relu(x)
+         return x
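+
+
+ # A minimal sketch (hypothetical widths, not part of the original file):
+ # an identity-path block built from the X3D transform.
+ #
+ #   block = ResBlock(dim_in=24, dim_out=24, temp_kernel_size=3, stride=1,
+ #                    trans_func=get_trans_func("x3d_transform"),
+ #                    dim_inner=54, num_groups=54)
+ #   out = block(torch.randn(1, 24, 4, 16, 16))  # residual add, then ReLU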
+
+
+ class ResStage(nn.Module):
+     """
+     Stage of 3D ResNet. It expects to have one or more tensors as input for
+     single pathway (C2D, I3D, Slow), and multi-pathway (SlowFast) cases.
+     More details can be found here:
+
+     Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
+     "SlowFast networks for video recognition."
+     https://arxiv.org/pdf/1812.03982.pdf
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         stride,
+         temp_kernel_sizes,
+         num_blocks,
+         dim_inner,
+         num_groups,
+         num_block_temp_kernel,
+         nonlocal_inds,
+         nonlocal_group,
+         nonlocal_pool,
+         dilation,
+         instantiation="softmax",
+         trans_func_name="bottleneck_transform",
+         stride_1x1=False,
+         inplace_relu=True,
+         norm_module=nn.BatchNorm3d,
+         drop_connect_rate=0.0,
+     ):
+         """
+         The `__init__` method of any subclass should also contain these arguments.
+         ResStage builds p streams, where p can be greater than or equal to one.
+         Args:
+             dim_in (list): list of the p channel dimensions of the input.
+                 Different channel dimensions control the input dimension of
+                 different pathways.
+             dim_out (list): list of the p channel dimensions of the output.
+                 Different channel dimensions control the output dimension of
+                 different pathways.
+             temp_kernel_sizes (list): list of the p temporal kernel sizes of the
+                 convolution in the bottleneck. Different temp_kernel_sizes
+                 control different pathways.
+             stride (list): list of the p strides of the bottleneck. Different
+                 strides control different pathways.
+             num_blocks (list): list of p numbers of blocks for each of the
+                 pathways.
+             dim_inner (list): list of the p inner channel dimensions of the
+                 input. Different channel dimensions control the inner dimension
+                 of different pathways.
+             num_groups (list): list of the p numbers of groups for the
+                 convolution. num_groups=1 is for standard ResNet-like networks,
+                 and num_groups>1 is for ResNeXt-like networks.
+             num_block_temp_kernel (list): extend the temp_kernel_sizes to
+                 num_block_temp_kernel blocks, then fill in a temporal kernel
+                 size of 1 for the rest of the layers.
+             nonlocal_inds (list): if the list is empty, no nonlocal layer will
+                 be added. If the list is not empty, add nonlocal layers after
+                 the index-th block.
+             dilation (list): size of dilation for each pathway.
+             nonlocal_group (list): list of the p numbers of nonlocal groups.
+                 Each number controls how to fold the temporal dimension into
+                 the batch dimension before applying the nonlocal transformation.
+                 https://github.com/facebookresearch/video-nonlocal-net.
+             instantiation (string): different instantiation for the nonlocal
+                 layer. Supports two different instantiation methods:
+                 "dot_product": normalizing correlation matrix with L2.
+                 "softmax": normalizing correlation matrix with Softmax.
+             trans_func_name (string): name of the transformation function applied
+                 to the network.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+             drop_connect_rate (float): basic rate at which blocks are dropped,
+                 linearly increases from input to output blocks.
+         """
+         super(ResStage, self).__init__()
+         assert all(
+             (
+                 num_block_temp_kernel[i] <= num_blocks[i]
+                 for i in range(len(temp_kernel_sizes))
+             )
+         )
+         self.num_blocks = num_blocks
+         self.nonlocal_group = nonlocal_group
+         self._drop_connect_rate = drop_connect_rate
+         self.temp_kernel_sizes = [
+             (temp_kernel_sizes[i] * num_blocks[i])[: num_block_temp_kernel[i]]
+             + [1] * (num_blocks[i] - num_block_temp_kernel[i])
+             for i in range(len(temp_kernel_sizes))
+         ]
+         assert (
+             len(
+                 {
+                     len(dim_in),
+                     len(dim_out),
+                     len(temp_kernel_sizes),
+                     len(stride),
+                     len(num_blocks),
+                     len(dim_inner),
+                     len(num_groups),
+                     len(num_block_temp_kernel),
+                     len(nonlocal_inds),
+                     len(nonlocal_group),
+                 }
+             )
+             == 1
+         )
+         self.num_pathways = len(self.num_blocks)
+         self._construct(
+             dim_in,
+             dim_out,
+             stride,
+             dim_inner,
+             num_groups,
+             trans_func_name,
+             stride_1x1,
+             inplace_relu,
+             nonlocal_inds,
+             nonlocal_pool,
+             instantiation,
+             dilation,
+             norm_module,
+         )
+
+     def _construct(
+         self,
+         dim_in,
+         dim_out,
+         stride,
+         dim_inner,
+         num_groups,
+         trans_func_name,
+         stride_1x1,
+         inplace_relu,
+         nonlocal_inds,
+         nonlocal_pool,
+         instantiation,
+         dilation,
+         norm_module,
+     ):
+         for pathway in range(self.num_pathways):
+             for i in range(self.num_blocks[pathway]):
+                 # Retrieve the transformation function.
+                 trans_func = get_trans_func(trans_func_name)
+                 # Construct the block.
+                 res_block = ResBlock(
+                     dim_in[pathway] if i == 0 else dim_out[pathway],
+                     dim_out[pathway],
+                     self.temp_kernel_sizes[pathway][i],
+                     stride[pathway] if i == 0 else 1,
+                     trans_func,
+                     dim_inner[pathway],
+                     num_groups[pathway],
+                     stride_1x1=stride_1x1,
+                     inplace_relu=inplace_relu,
+                     dilation=dilation[pathway],
+                     norm_module=norm_module,
+                     block_idx=i,
+                     drop_connect_rate=self._drop_connect_rate,
+                 )
+                 self.add_module("pathway{}_res{}".format(pathway, i), res_block)
+                 if i in nonlocal_inds[pathway]:
+                     nln = Nonlocal(
+                         dim_out[pathway],
+                         dim_out[pathway] // 2,
+                         nonlocal_pool[pathway],
+                         instantiation=instantiation,
+                         norm_module=norm_module,
+                     )
+                     self.add_module(
+                         "pathway{}_nonlocal{}".format(pathway, i), nln
+                     )
+
+     def forward(self, inputs):
+         output = []
+         for pathway in range(self.num_pathways):
+             x = inputs[pathway]
+             for i in range(self.num_blocks[pathway]):
+                 m = getattr(self, "pathway{}_res{}".format(pathway, i))
+                 x = m(x)
+                 if hasattr(self, "pathway{}_nonlocal{}".format(pathway, i)):
+                     nln = getattr(
+                         self, "pathway{}_nonlocal{}".format(pathway, i)
+                     )
+                     b, c, t, h, w = x.shape
+                     if self.nonlocal_group[pathway] > 1:
+                         # Fold temporal dimension into batch dimension.
+                         x = x.permute(0, 2, 1, 3, 4)
+                         x = x.reshape(
+                             b * self.nonlocal_group[pathway],
+                             t // self.nonlocal_group[pathway],
+                             c,
+                             h,
+                             w,
+                         )
+                         x = x.permute(0, 2, 1, 3, 4)
+                     x = nln(x)
+                     if self.nonlocal_group[pathway] > 1:
+                         # Fold back to temporal dimension.
+                         x = x.permute(0, 2, 1, 3, 4)
+                         x = x.reshape(b, t, c, h, w)
+                         x = x.permute(0, 2, 1, 3, 4)
+             output.append(x)
+
+         return output
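+
+
+ if __name__ == "__main__":
+     # A minimal single-pathway smoke test (hypothetical X3D-like widths, not
+     # part of the original file). Note the list-of-tensors convention.
+     stage = ResStage(
+         dim_in=[24], dim_out=[24], stride=[2], temp_kernel_sizes=[[3]],
+         num_blocks=[2], dim_inner=[54], num_groups=[54],
+         num_block_temp_kernel=[2], nonlocal_inds=[[]], nonlocal_group=[1],
+         nonlocal_pool=[None], dilation=[1], trans_func_name="x3d_transform",
+     )
+     out = stage([torch.randn(1, 24, 4, 32, 32)])
+     print(out[0].shape)  # -> torch.Size([1, 24, 4, 16, 16])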
helpers/stem.py ADDED
@@ -0,0 +1,320 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+ """ResNe(X)t 3D stem helper."""
+
+ import torch.nn as nn
+
+
+ def get_stem_func(name):
+     """
+     Retrieves the stem module by name.
+     """
+     trans_funcs = {"x3d_stem": X3DStem, "basic_stem": ResNetBasicStem}
+     assert (
+         name in trans_funcs.keys()
+     ), "Transformation function '{}' not supported".format(name)
+     return trans_funcs[name]
+
+
+ class VideoModelStem(nn.Module):
+     """
+     Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool
+     on input data tensor for one or multiple pathways.
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         kernel,
+         stride,
+         padding,
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         norm_module=nn.BatchNorm3d,
+         stem_func_name="basic_stem",
+     ):
+         """
+         The `__init__` method of any subclass should also contain these
+         arguments. Use a list size of 1 for single pathway models (C2D, I3D,
+         Slow, etc.), and a list size of 2 for two pathway models (SlowFast).
+
+         Args:
+             dim_in (list): the list of channel dimensions of the inputs.
+             dim_out (list): the output dimensions of the convolutions in the
+                 stem layer.
+             kernel (list): the kernel sizes of the convolutions in the stem
+                 layer. Temporal kernel size, height kernel size, width kernel
+                 size in order.
+             stride (list): the stride sizes of the convolutions in the stem
+                 layer. Temporal stride, height stride, width stride in order.
+             padding (list): the padding sizes of the convolutions in the stem
+                 layer. Temporal padding size, height padding size, width
+                 padding size in order.
+             inplace_relu (bool): calculate the relu on the original input
+                 without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+             stem_func_name (string): name of the stem function applied on
+                 input to the network.
+         """
+         super(VideoModelStem, self).__init__()
+
+         assert (
+             len(
+                 {
+                     len(dim_in),
+                     len(dim_out),
+                     len(kernel),
+                     len(stride),
+                     len(padding),
+                 }
+             )
+             == 1
+         ), "Input pathway dimensions are not consistent. {} {} {} {} {}".format(
+             len(dim_in),
+             len(dim_out),
+             len(kernel),
+             len(stride),
+             len(padding),
+         )
+
+         self.num_pathways = len(dim_in)
+         self.kernel = kernel
+         self.stride = stride
+         self.padding = padding
+         self.inplace_relu = inplace_relu
+         self.eps = eps
+         self.bn_mmt = bn_mmt
+         # Construct the stem layer.
+         self._construct_stem(dim_in, dim_out, norm_module, stem_func_name)
+
+     def _construct_stem(self, dim_in, dim_out, norm_module, stem_func_name):
+         trans_func = get_stem_func(stem_func_name)
+
+         for pathway in range(len(dim_in)):
+             stem = trans_func(
+                 dim_in[pathway],
+                 dim_out[pathway],
+                 self.kernel[pathway],
+                 self.stride[pathway],
+                 self.padding[pathway],
+                 self.inplace_relu,
+                 self.eps,
+                 self.bn_mmt,
+                 norm_module,
+             )
+             self.add_module("pathway{}_stem".format(pathway), stem)
+
+     def forward(self, x):
+         assert (
+             len(x) == self.num_pathways
+         ), "Input tensor does not contain {} pathways".format(self.num_pathways)
+         # Use a new list; don't modify the x list in-place, which is bad for
+         # activation checkpointing.
+         y = []
+         for pathway in range(len(x)):
+             m = getattr(self, "pathway{}_stem".format(pathway))
+             y.append(m(x[pathway]))
+         return y
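+
+
+ # A minimal sketch (hypothetical X3D-like shapes, not part of the original
+ # file): per-pathway kernels/strides/paddings are given as lists of lists,
+ # and the stem consumes and returns a list of tensors.
+ #
+ #   stem = VideoModelStem(dim_in=[3], dim_out=[24], kernel=[[5, 3, 3]],
+ #                         stride=[[1, 2, 2]], padding=[[2, 1, 1]],
+ #                         stem_func_name="x3d_stem")
+ #   y = stem([torch.randn(1, 3, 13, 160, 160)])  # y[0]: (1, 24, 13, 80, 80)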
+
+
+ class ResNetBasicStem(nn.Module):
+     """
+     ResNe(X)t 3D stem module.
+     Performs a spatiotemporal convolution, BN, and ReLU, followed by
+     spatiotemporal pooling.
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         kernel,
+         stride,
+         padding,
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         norm_module=nn.BatchNorm3d,
+     ):
+         """
+         The `__init__` method of any subclass should also contain these arguments.
+
+         Args:
+             dim_in (int): the channel dimension of the input. Normally 3 is used
+                 for rgb input, and 2 or 3 is used for optical flow input.
+             dim_out (int): the output dimension of the convolution in the stem
+                 layer.
+             kernel (list): the kernel size of the convolution in the stem layer.
+                 Temporal kernel size, height kernel size, width kernel size in
+                 order.
+             stride (list): the stride size of the convolution in the stem layer.
+                 Temporal stride, height stride, width stride in order.
+             padding (list): the padding size of the convolution in the stem
+                 layer. Temporal padding size, height padding size, width
+                 padding size in order.
+             inplace_relu (bool): calculate the relu on the original input
+                 without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+         """
+         super(ResNetBasicStem, self).__init__()
+         self.kernel = kernel
+         self.stride = stride
+         self.padding = padding
+         self.inplace_relu = inplace_relu
+         self.eps = eps
+         self.bn_mmt = bn_mmt
+         # Construct the stem layer.
+         self._construct_stem(dim_in, dim_out, norm_module)
+
+     def _construct_stem(self, dim_in, dim_out, norm_module):
+         self.conv = nn.Conv3d(
+             dim_in,
+             dim_out,
+             self.kernel,
+             stride=self.stride,
+             padding=self.padding,
+             bias=False,
+         )
+         self.bn = norm_module(num_features=dim_out, eps=self.eps, momentum=self.bn_mmt)
+         self.relu = nn.ReLU(inplace=self.inplace_relu)
+         self.pool_layer = nn.MaxPool3d(
+             kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1]
+         )
+
+     def forward(self, x):
+         x = self.conv(x)
+         x = self.bn(x)
+         x = self.relu(x)
+         x = self.pool_layer(x)
+         return x
+
+
+ class X3DStem(nn.Module):
+     """
+     X3D's 3D stem module.
+     Performs a 1xHxW spatial convolution followed by a Tx1x1 depthwise
+     temporal convolution, then BN and ReLU (no pooling).
+     """
+
+     def __init__(
+         self,
+         dim_in,
+         dim_out,
+         kernel,
+         stride,
+         padding,
+         inplace_relu=True,
+         eps=1e-5,
+         bn_mmt=0.1,
+         norm_module=nn.BatchNorm3d,
+     ):
+         """
+         The `__init__` method of any subclass should also contain these arguments.
+
+         Args:
+             dim_in (int): the channel dimension of the input. Normally 3 is used
+                 for rgb input, and 2 or 3 is used for optical flow input.
+             dim_out (int): the output dimension of the convolution in the stem
+                 layer.
+             kernel (list): the kernel size of the convolution in the stem layer.
+                 Temporal kernel size, height kernel size, width kernel size in
+                 order.
+             stride (list): the stride size of the convolution in the stem layer.
+                 Temporal stride, height stride, width stride in order.
+             padding (list): the padding size of the convolution in the stem
+                 layer. Temporal padding size, height padding size, width
+                 padding size in order.
+             inplace_relu (bool): calculate the relu on the original input
+                 without allocating new memory.
+             eps (float): epsilon for batch norm.
+             bn_mmt (float): momentum for batch norm. Note that BN momentum in
+                 PyTorch = 1 - BN momentum in Caffe2.
+             norm_module (nn.Module): nn.Module for the normalization layer. The
+                 default is nn.BatchNorm3d.
+         """
+         super(X3DStem, self).__init__()
+         self.kernel = kernel
+         self.stride = stride
+         self.padding = padding
+         self.inplace_relu = inplace_relu
+         self.eps = eps
+         self.bn_mmt = bn_mmt
+         # Construct the stem layer.
+         self._construct_stem(dim_in, dim_out, norm_module)
+
+     def _construct_stem(self, dim_in, dim_out, norm_module):
+         self.conv_xy = nn.Conv3d(
+             dim_in,
+             dim_out,
+             kernel_size=(1, self.kernel[1], self.kernel[2]),
+             stride=(1, self.stride[1], self.stride[2]),
+             padding=(0, self.padding[1], self.padding[2]),
+             bias=False,
+         )
+         self.conv = nn.Conv3d(
+             dim_out,
+             dim_out,
+             kernel_size=(self.kernel[0], 1, 1),
+             stride=(self.stride[0], 1, 1),
+             padding=(self.padding[0], 0, 0),
+             bias=False,
+             groups=dim_out,
+         )
+
+         self.bn = norm_module(num_features=dim_out, eps=self.eps, momentum=self.bn_mmt)
+         self.relu = nn.ReLU(inplace=self.inplace_relu)
+
+     def forward(self, x):
+         x = self.conv_xy(x)
+         x = self.conv(x)
+         x = self.bn(x)
+         x = self.relu(x)
+         return x
286
+
287
+
288
+ class PatchEmbed(nn.Module):
289
+ """
290
+ PatchEmbed.
291
+ """
292
+
293
+ def __init__(
294
+ self,
295
+ dim_in=3,
296
+ dim_out=768,
297
+ kernel=(1, 16, 16),
298
+ stride=(1, 4, 4),
299
+ padding=(1, 7, 7),
300
+ conv_2d=False,
301
+ ):
302
+ super().__init__()
303
+ if conv_2d:
304
+ conv = nn.Conv2d
305
+ else:
306
+ conv = nn.Conv3d
307
+ self.proj = conv(
308
+ dim_in,
309
+ dim_out,
310
+ kernel_size=kernel,
311
+ stride=stride,
312
+ padding=padding,
313
+ )
314
+
315
+ def forward(self, x, keep_spatial=False):
316
+ x = self.proj(x)
317
+ if keep_spatial:
318
+ return x, x.shape
319
+ # B C (T) H W -> B (T)HW C
320
+ return x.flatten(2).transpose(1, 2), x.shape
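
A quick shape check for the X3DStem above (not part of the committed file; the `x3d_model.helpers.stem` import path is assumed from the package layout used elsewhere in this repo, and the kernel/stride values are illustrative, not read from config.yml):

    import torch
    from x3d_model.helpers.stem import X3DStem

    # 5x3x3 stem kernel, split into a 1x3x3 spatial conv and a depthwise
    # 5x1x1 temporal conv by _construct_stem above.
    stem = X3DStem(
        dim_in=3, dim_out=24, kernel=[5, 3, 3], stride=[1, 2, 2], padding=[2, 1, 1]
    )
    clip = torch.randn(1, 3, 13, 160, 160)  # B, C, T, H, W
    print(stem(clip).shape)  # torch.Size([1, 24, 13, 80, 80]): H/W halved, T kept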
modeling_x3d.py ADDED
@@ -0,0 +1,15 @@
+ from transformers import PreTrainedModel
+ from x3d_model.configuration_x3d import X3DConfig
+ from x3d_model.x3d import build_model
+
+
+ class X3DModel(PreTrainedModel):
+     config_class = X3DConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = build_model(config.cfg)
+
+     def forward(self, input_video):
+         outputs = self.model(input_video)
+         return outputs
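
A rough usage sketch for this wrapper (assumptions: the repo is importable as the `x3d_model` package, `X3DConfig` accepts the SlowFast-style cfg node as the `cfg` attribute read above, and `config.yml` is the checked-in config; none of this is verified against configuration_x3d.py):

    import torch
    from x3d_model.cfg import load_config
    from x3d_model.configuration_x3d import X3DConfig
    from x3d_model.modeling_x3d import X3DModel

    cfg = load_config("config.yml")
    cfg.NUM_GPUS = 0  # CPU-only sketch
    model = X3DModel(X3DConfig(cfg=cfg))  # X3DConfig(cfg=...) is assumed, not verified
    model.eval()

    clip = torch.randn(1, 3, cfg.DATA.NUM_FRAMES, 160, 160)  # B, C, T, H, W
    with torch.no_grad():
        logits = model([clip])  # single-pathway input: a list with one tensor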
x3d.py ADDED
@@ -0,0 +1,350 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+ import math
+ import torch
+ from torch import nn
+ from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks_default
+ from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill
+
+ from .helpers.norm import get_norm
+ from .helpers.stem import VideoModelStem
+ from .helpers.resnet import ResStage
+ from .helpers.head import X3DHead
+
+ # round width
+
+
+ def round_width(width, multiplier, min_width=1, divisor=1):
+     # Scales `width` by `multiplier` and snaps the result to a multiple of
+     # `divisor`, e.g. round_width(24, 2.0, divisor=8) -> 48.
+     if not multiplier:
+         return width
+     width *= multiplier
+     min_width = min_width or divisor
+     width_out = max(min_width, int(width + divisor / 2) // divisor * divisor)
+     if width_out < 0.9 * width:
+         width_out += divisor
+     return int(width_out)
+
+ # init weights
+
+
+ def init_weights(
+     model, fc_init_std=0.01, zero_init_final_bn=True, zero_init_final_conv=False
+ ):
+     """
+     Performs ResNet style weight initialization.
+     Args:
+         fc_init_std (float): the expected standard deviation for fc layer.
+         zero_init_final_bn (bool): if True, zero initialize the final bn for
+             every bottleneck.
+         zero_init_final_conv (bool): if True, zero initialize convolutions
+             marked with a `final_conv` attribute.
+     """
+     for m in model.modules():
+         if isinstance(m, nn.Conv3d):
+             # Note that there is no bias due to BN
+             if hasattr(m, "final_conv") and zero_init_final_conv:
+                 m.weight.data.zero_()
+             else:
+                 """
+                 Follow the initialization method proposed in:
+                 {He, Kaiming, et al.
+                 "Delving deep into rectifiers: Surpassing human-level
+                 performance on imagenet classification."
+                 arXiv preprint arXiv:1502.01852 (2015)}
+                 """
+                 c2_msra_fill(m)
+
+         elif isinstance(m, (nn.BatchNorm3d, nn.BatchNorm2d, nn.BatchNorm1d)):
+             if (
+                 hasattr(m, "transform_final_bn")
+                 and m.transform_final_bn
+                 and zero_init_final_bn
+             ):
+                 batchnorm_weight = 0.0
+             else:
+                 batchnorm_weight = 1.0
+             if m.weight is not None:
+                 m.weight.data.fill_(batchnorm_weight)
+             if m.bias is not None:
+                 m.bias.data.zero_()
+         if isinstance(m, nn.Linear):
+             if hasattr(m, "xavier_init") and m.xavier_init:
+                 c2_xavier_fill(m)
+             else:
+                 m.weight.data.normal_(mean=0.0, std=fc_init_std)
+             if m.bias is not None:
+                 m.bias.data.zero_()
+
+ # pool1
+
+ _POOL1 = {
+     "2d": [[1, 1, 1]],
+     "c2d": [[2, 1, 1]],
+     "slow_c2d": [[1, 1, 1]],
+     "i3d": [[2, 1, 1]],
+     "slow_i3d": [[1, 1, 1]],
+     "slow": [[1, 1, 1]],
+     "slowfast": [[1, 1, 1], [1, 1, 1]],
+     "x3d": [[1, 1, 1]],
+ }
+
+ # temporal kernel basis
+
+ _TEMPORAL_KERNEL_BASIS = {
+     "2d": [
+         [[1]],  # conv1 temporal kernel.
+         [[1]],  # res2 temporal kernel.
+         [[1]],  # res3 temporal kernel.
+         [[1]],  # res4 temporal kernel.
+         [[1]],  # res5 temporal kernel.
+     ],
+     "c2d": [
+         [[1]],  # conv1 temporal kernel.
+         [[1]],  # res2 temporal kernel.
+         [[1]],  # res3 temporal kernel.
+         [[1]],  # res4 temporal kernel.
+         [[1]],  # res5 temporal kernel.
+     ],
+     "slow_c2d": [
+         [[1]],  # conv1 temporal kernel.
+         [[1]],  # res2 temporal kernel.
+         [[1]],  # res3 temporal kernel.
+         [[1]],  # res4 temporal kernel.
+         [[1]],  # res5 temporal kernel.
+     ],
+     "i3d": [
+         [[5]],  # conv1 temporal kernel.
+         [[3]],  # res2 temporal kernel.
+         [[3, 1]],  # res3 temporal kernel.
+         [[3, 1]],  # res4 temporal kernel.
+         [[1, 3]],  # res5 temporal kernel.
+     ],
+     "slow_i3d": [
+         [[5]],  # conv1 temporal kernel.
+         [[3]],  # res2 temporal kernel.
+         [[3, 1]],  # res3 temporal kernel.
+         [[3, 1]],  # res4 temporal kernel.
+         [[1, 3]],  # res5 temporal kernel.
+     ],
+     "slow": [
+         [[1]],  # conv1 temporal kernel.
+         [[1]],  # res2 temporal kernel.
+         [[1]],  # res3 temporal kernel.
+         [[3]],  # res4 temporal kernel.
+         [[3]],  # res5 temporal kernel.
+     ],
+     "slowfast": [
+         [[1], [5]],  # conv1 temporal kernel for slow and fast pathway.
+         [[1], [3]],  # res2 temporal kernel for slow and fast pathway.
+         [[1], [3]],  # res3 temporal kernel for slow and fast pathway.
+         [[3], [3]],  # res4 temporal kernel for slow and fast pathway.
+         [[3], [3]],  # res5 temporal kernel for slow and fast pathway.
+     ],
+     "x3d": [
+         [[5]],  # conv1 temporal kernels.
+         [[3]],  # res2 temporal kernels.
+         [[3]],  # res3 temporal kernels.
+         [[3]],  # res4 temporal kernels.
+         [[3]],  # res5 temporal kernels.
+     ],
+ }
+
+ # model stage depth
+
+ _MODEL_STAGE_DEPTH = {18: (2, 2, 2, 2), 50: (3, 4, 6, 3), 101: (3, 4, 23, 3)}
+
+ # X3D model
+
+
+ class X3D(nn.Module):
+     """
+     X3D model builder. It builds an X3D network backbone, which is a ResNet.
+
+     Christoph Feichtenhofer.
+     "X3D: Expanding Architectures for Efficient Video Recognition."
+     https://arxiv.org/abs/2004.04730
+     """
+
+     def __init__(self, cfg):
+         """
+         The `__init__` method of any subclass should also contain these
+         arguments.
+
+         Args:
+             cfg (CfgNode): model building configs, details are in the
+                 comments of the config file.
+         """
+         super(X3D, self).__init__()
+         self.norm_module = get_norm(cfg)
+         self.enable_detection = cfg.DETECTION.ENABLE
+         self.num_pathways = 1
+
+         exp_stage = 2.0
+         self.dim_c1 = cfg.X3D.DIM_C1
+
+         self.dim_res2 = (
+             round_width(self.dim_c1, exp_stage, divisor=8)
+             if cfg.X3D.SCALE_RES2
+             else self.dim_c1
+         )
+         self.dim_res3 = round_width(self.dim_res2, exp_stage, divisor=8)
+         self.dim_res4 = round_width(self.dim_res3, exp_stage, divisor=8)
+         self.dim_res5 = round_width(self.dim_res4, exp_stage, divisor=8)
+
+         self.block_basis = [
+             # blocks, c, stride
+             [1, self.dim_res2, 2],
+             [2, self.dim_res3, 2],
+             [5, self.dim_res4, 2],
+             [3, self.dim_res5, 2],
+         ]
+         self._construct_network(cfg)
+         init_weights(
+             self, cfg.MODEL.FC_INIT_STD, cfg.RESNET.ZERO_INIT_FINAL_BN
+         )
+
+     def _round_repeats(self, repeats, multiplier):
+         """Round number of layers based on depth multiplier."""
+         if not multiplier:
+             return repeats
+         return int(math.ceil(multiplier * repeats))
+
+     def _construct_network(self, cfg):
+         """
+         Builds a single pathway X3D model.
+
+         Args:
+             cfg (CfgNode): model building configs, details are in the
+                 comments of the config file.
+         """
+         assert cfg.MODEL.ARCH in _POOL1.keys()
+         assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys()
+
+         (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH]
+
+         num_groups = cfg.RESNET.NUM_GROUPS
+         width_per_group = cfg.RESNET.WIDTH_PER_GROUP
+         dim_inner = num_groups * width_per_group
+
+         w_mul = cfg.X3D.WIDTH_FACTOR
+         d_mul = cfg.X3D.DEPTH_FACTOR
+         dim_res1 = round_width(self.dim_c1, w_mul)
+
+         temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH]
+
+         self.s1 = VideoModelStem(
+             dim_in=cfg.DATA.INPUT_CHANNEL_NUM,
+             dim_out=[dim_res1],
+             kernel=[temp_kernel[0][0] + [3, 3]],
+             stride=[[1, 2, 2]],
+             padding=[[temp_kernel[0][0][0] // 2, 1, 1]],
+             norm_module=self.norm_module,
+             stem_func_name="x3d_stem",
+         )
+
+         # blob_in = s1
+         dim_in = dim_res1
+         for stage, block in enumerate(self.block_basis):
+             dim_out = round_width(block[1], w_mul)
+             dim_inner = int(cfg.X3D.BOTTLENECK_FACTOR * dim_out)
+
+             n_rep = self._round_repeats(block[0], d_mul)
+             # start w res2 to follow convention
+             prefix = "s{}".format(stage + 2)
+
+             s = ResStage(
+                 dim_in=[dim_in],
+                 dim_out=[dim_out],
+                 dim_inner=[dim_inner],
+                 temp_kernel_sizes=temp_kernel[1],
+                 stride=[block[2]],
+                 num_blocks=[n_rep],
+                 num_groups=[dim_inner] if cfg.X3D.CHANNELWISE_3x3x3 else [num_groups],
+                 num_block_temp_kernel=[n_rep],
+                 nonlocal_inds=cfg.NONLOCAL.LOCATION[0],
+                 nonlocal_group=cfg.NONLOCAL.GROUP[0],
+                 nonlocal_pool=cfg.NONLOCAL.POOL[0],
+                 instantiation=cfg.NONLOCAL.INSTANTIATION,
+                 trans_func_name=cfg.RESNET.TRANS_FUNC,
+                 stride_1x1=cfg.RESNET.STRIDE_1X1,
+                 norm_module=self.norm_module,
+                 dilation=cfg.RESNET.SPATIAL_DILATIONS[stage],
+                 drop_connect_rate=cfg.MODEL.DROPCONNECT_RATE
+                 * (stage + 2)
+                 / (len(self.block_basis) + 1),
+             )
+             dim_in = dim_out
+             self.add_module(prefix, s)
+
+         if self.enable_detection:
+             raise NotImplementedError
+         else:
+             spat_sz = int(math.ceil(cfg.DATA.TRAIN_CROP_SIZE / 32.0))
+             self.head = X3DHead(
+                 dim_in=dim_out,
+                 dim_inner=dim_inner,
+                 dim_out=cfg.X3D.DIM_C5,
+                 num_classes=cfg.MODEL.NUM_CLASSES,
+                 pool_size=[cfg.DATA.NUM_FRAMES, spat_sz, spat_sz],
+                 dropout_rate=cfg.MODEL.DROPOUT_RATE,
+                 act_func=cfg.MODEL.HEAD_ACT,
+                 bn_lin5_on=cfg.X3D.BN_LIN5,
+             )
+
+     def forward(self, x, bboxes=None):
+         for module in self.children():
+             x = module(x)
+         return x
+
+
+ def build_model(cfg, gpu_id=None):
+     if torch.cuda.is_available():
+         assert (
+             cfg.NUM_GPUS <= torch.cuda.device_count()
+         ), "Cannot use more GPU devices than available"
+     else:
+         assert (
+             cfg.NUM_GPUS == 0
+         ), "CUDA is not available. Please set `NUM_GPUS: 0` for running on CPUs."
+
+     # Construct the model
+     model = X3D(cfg)
+
+     if cfg.BN.NORM_TYPE == "sync_batchnorm_apex":
+         try:
+             import apex
+         except ImportError:
+             raise ImportError("APEX is required for this model, please install")
+
+         process_group = apex.parallel.create_syncbn_process_group(
+             group_size=cfg.BN.NUM_SYNC_DEVICES
+         )
+         model = apex.parallel.convert_syncbn_model(model, process_group=process_group)
+
+     if cfg.NUM_GPUS:
+         if gpu_id is None:
+             # Determine the GPU used by the current process
+             cur_device = torch.cuda.current_device()
+         else:
+             cur_device = gpu_id
+         # Transfer the model to the current GPU device
+         model = model.cuda(device=cur_device)
+     # Use multi-process data parallel model in the multi-gpu setting
+     if cfg.NUM_GPUS > 1:
+         # Make model replica operate on the current device
+         model = torch.nn.parallel.DistributedDataParallel(
+             module=model,
+             device_ids=[cur_device],
+             output_device=cur_device,
+             find_unused_parameters=(
+                 True
+                 if cfg.MODEL.DETACH_FINAL_FC
+                 or cfg.MODEL.MODEL_NAME == "ContrastiveModel"
+                 else False
+             ),
+         )
+         if cfg.MODEL.FP16_ALLREDUCE:
+             model.register_comm_hook(
+                 state=None, hook=comm_hooks_default.fp16_compress_hook
+             )
+     return model
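
`build_model` above handles device placement itself, so a minimal CPU-only construction (skipping the Apex sync-BN and DDP branches) could look like the sketch below; `NUM_GPUS: 0` is required on machines without CUDA, as the assertion enforces. Import paths assume the repo is importable as `x3d_model`.

    from x3d_model.cfg import load_config
    from x3d_model.x3d import build_model

    cfg = load_config("config.yml")
    cfg.NUM_GPUS = 0  # stay on the false branch of `if cfg.NUM_GPUS:`: no .cuda(), no DDP
    model = build_model(cfg)
    print(sum(p.numel() for p in model.parameters()))  # rough parameter-count check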