# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import logging
import unittest
from typing import Any, Callable, Dict, List, Tuple

import torch
import torch.nn as nn
from fvcore.common.benchmark import benchmark
from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
    Conv3d3x3x3DwBnAct,
    Conv3dPwBnAct,
)
from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
    X3dBottleneckBlock,
)
from torch.utils.mobile_optimizer import optimize_for_mobile


TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
if TORCH_VERSION >= (1, 11):
    from torch.ao.quantization import (
        convert,
        DeQuantStub,
        fuse_modules,
        get_default_qconfig,
        prepare,
        QuantStub,
        # quantize_fx
    )
else:
    from torch.quantization import (
        convert,
        DeQuantStub,
        fuse_modules,
        get_default_qconfig,
        prepare,
        QuantStub,
        # quantize_fx
    )


# Modes accepted by the benchmark factories below.
_VALID_MODES: Tuple[str, ...] = ("original", "deployable")

# (mode, quantize, native_conv3d_op_qnnpack) combinations exercised by every
# benchmark in this file, in the order they are reported.
_CONFIG_VARIANTS: Tuple[Tuple[str, bool, bool], ...] = (
    ("original", False, False),
    ("deployable", False, False),
    ("original", True, False),
    ("deployable", True, False),
    ("deployable", True, True),
)


def _benchmark_configs(**block_kwargs: Any) -> List[Dict[str, Any]]:
    """Build the list of benchmark configurations shared by all tests.

    Each configuration is a dict with keys in the order ``mode``, the given
    ``block_kwargs``, ``quantize`` and, only for the last variant,
    ``native_conv3d_op_qnnpack``.

    Args:
        block_kwargs: block-specific entries (e.g. ``input_blob_size``,
            ``in_channels``) copied into every configuration.

    Returns:
        One configuration dict per entry of ``_CONFIG_VARIANTS``.
    """
    configs = []
    for mode, quantize, native_conv3d in _CONFIG_VARIANTS:
        config = {"mode": mode, **block_kwargs, "quantize": quantize}
        if native_conv3d:
            config["native_conv3d_op_qnnpack"] = True
        configs.append(config)
    return configs


def _assert_valid_mode(mode: str) -> None:
    """Fail with an AssertionError if ``mode`` is not a supported mode."""
    assert mode in _VALID_MODES, (
        "kwargs['mode'] must be either 'original' or 'deployable', "
        "but got {}.".format(mode)
    )


def _log_failure(log_tag: str, error: Exception, kwargs: Dict[str, Any]) -> None:
    """Log a caught exception together with the configuration that caused it."""
    logging.info(
        "%s: catch exception '%s' with kwargs of %s", log_tag, error, kwargs
    )


def _func_to_benchmark_dummy() -> None:
    """No-op benchmark target returned when a configuration cannot be built."""
    return


def _make_forward_benchmark(
    conv_block: nn.Module,
    input_tensor: torch.Tensor,
    kwargs: Dict[str, Any],
    log_tag: str,
    fuse_conv_relu: bool,
) -> Callable[[], None]:
    """Prepare ``conv_block`` per ``kwargs`` and return a forward-pass callable.

    Steps, in order: optional conversion to deployable form, ``eval()``,
    optional quantization (wrapping the block in Quant/DeQuant stubs), TorchScript
    tracing, and ``optimize_for_mobile`` for non-quantized models. Exceptions
    raised by ``convert``, by tracing, or by the forward pass itself are logged
    at INFO level instead of propagating, so a configuration that is unsupported
    on the current platform does not fail the whole benchmark run.

    Args:
        conv_block: freshly constructed block to benchmark.
        input_tensor: input fed to the block during tracing and benchmarking.
        kwargs: configuration dict; reads ``mode``, ``quantize``,
            ``input_blob_size`` and optionally ``native_conv3d_op_qnnpack``.
        log_tag: prefix used in log messages.
        fuse_conv_relu: if True, fuse ``conv`` and ``act.act`` of
            ``conv_block.kernel`` before quantizing a block in "original" mode.

    Returns:
        A zero-argument callable running one forward pass, or a no-op callable
        if quantized conversion or tracing failed.
    """
    mode = kwargs["mode"]
    quantize = kwargs["quantize"]

    if mode == "deployable":
        conv_block.convert(
            kwargs["input_blob_size"],
            convert_for_quantize=quantize,
            native_conv3d_op_qnnpack=kwargs.get("native_conv3d_op_qnnpack", False),
        )
    conv_block.eval()

    if quantize:
        if fuse_conv_relu and mode == "original":
            # Manually fuse conv and relu.
            conv_block.kernel = fuse_modules(conv_block.kernel, ["conv", "act.act"])
        conv_block = nn.Sequential(QuantStub(), conv_block, DeQuantStub())
        conv_block.qconfig = get_default_qconfig("qnnpack")
        conv_block = prepare(conv_block)
        try:
            conv_block = convert(conv_block)
        except Exception as e:
            _log_failure(log_tag, e, kwargs)
            return _func_to_benchmark_dummy

    # Trace exactly once, for both the quantized and the float path.
    try:
        traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
    except Exception as e:
        _log_failure(log_tag, e, kwargs)
        return _func_to_benchmark_dummy

    if not quantize:
        traced_model = optimize_for_mobile(traced_model)

    logging.info("model arch: %s", traced_model)

    def func_to_benchmark() -> None:
        try:
            traced_model(input_tensor)
        except Exception as e:
            _log_failure(log_tag, e, kwargs)

    return func_to_benchmark


class TestBenchmarkEfficientBlocks(unittest.TestCase):
    """Latency benchmarks for the mobile-cpu efficient blocks."""

    def setUp(self):
        super().setUp()
        # manual_seed already seeds the default generator; no extra
        # set_rng_state round-trip is needed.
        torch.manual_seed(42)
        # Every test switches the process-wide quantized engine to "qnnpack";
        # restore the previous engine afterwards so other tests are unaffected.
        self.addCleanup(
            setattr,
            torch.backends.quantized,
            "engine",
            torch.backends.quantized.engine,
        )

    def test_benchmark_conv3d_pw_bn_relu(self, num_iters: int = 20) -> None:
        """
        Benchmark Conv3dPwBnAct with ReLU activation.
        Note efficient block Conv3dPwBnAct is designed for mobile cpu with qnnpack
        backend, and benchmarking on server with another backend (e.g., fbgemm) may
        have different latency result compared to running on mobile cpu with qnnpack.
        Running on x86 based server cpu with qnnpack may also have different latency
        from running on mobile cpu with qnnpack, as qnnpack is optimized for
        ARM based mobile cpu.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """
        torch.backends.quantized.engine = "qnnpack"
        log_tag = "benchmark_conv3d_pw_bn_relu"
        kwargs_list = _benchmark_configs(
            input_blob_size=(1, 48, 4, 40, 40),
            in_channels=48,
            out_channels=108,
        )

        def _benchmark_conv3d_pw_bn_relu_forward(**kwargs) -> Callable:
            _assert_valid_mode(kwargs["mode"])
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = Conv3dPwBnAct(
                kwargs["in_channels"],
                kwargs["out_channels"],
                use_bn=False,  # assume BN has already been fused for forward
            )
            return _make_forward_benchmark(
                conv_block, input_tensor, kwargs, log_tag, fuse_conv_relu=True
            )

        benchmark(
            _benchmark_conv3d_pw_bn_relu_forward,
            log_tag,
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)

    def test_benchmark_conv3d_3x3x3_dw_bn_relu(self, num_iters: int = 20) -> None:
        """
        Benchmark Conv3d3x3x3DwBnAct with ReLU activation.
        Note efficient block Conv3d3x3x3DwBnAct is designed for mobile cpu with
        qnnpack backend, and benchmarking on server with another backend
        (e.g., fbgemm) may have different latency result compared to running on
        mobile cpu.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """
        torch.backends.quantized.engine = "qnnpack"
        log_tag = "benchmark_conv3d_3x3x3_dw_bn_relu"
        kwargs_list = _benchmark_configs(
            input_blob_size=(1, 48, 4, 40, 40),
            in_channels=48,
        )

        def _benchmark_conv3d_3x3x3_dw_bn_relu_forward(**kwargs) -> Callable:
            _assert_valid_mode(kwargs["mode"])
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = Conv3d3x3x3DwBnAct(
                kwargs["in_channels"],
                use_bn=False,  # assume BN has already been fused for forward
            )
            return _make_forward_benchmark(
                conv_block, input_tensor, kwargs, log_tag, fuse_conv_relu=True
            )

        benchmark(
            _benchmark_conv3d_3x3x3_dw_bn_relu_forward,
            log_tag,
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)

    def test_benchmark_x3d_bottleneck_block(self, num_iters: int = 20) -> None:
        """
        Benchmark X3dBottleneckBlock.
        Note efficient block X3dBottleneckBlock is designed for mobile cpu with
        qnnpack backend, and benchmarking on server/laptop may have different
        latency result compared to running on mobile cpu.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """
        torch.backends.quantized.engine = "qnnpack"
        log_tag = "benchmark_x3d_bottleneck_forward"
        kwargs_list = _benchmark_configs(
            input_blob_size=(1, 48, 4, 20, 20),
            in_channels=48,
            mid_channels=108,
            out_channels=48,
        )

        def _benchmark_x3d_bottleneck_forward(**kwargs) -> Callable:
            _assert_valid_mode(kwargs["mode"])
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = X3dBottleneckBlock(
                kwargs["in_channels"],
                kwargs["mid_channels"],
                kwargs["out_channels"],
                use_bn=(False, False, False),  # Assume BN has been fused for forward
            )
            # No manual conv/relu fusion is applied to the bottleneck block.
            return _make_forward_benchmark(
                conv_block, input_tensor, kwargs, log_tag, fuse_conv_relu=False
            )

        benchmark(
            _benchmark_x3d_bottleneck_forward,
            log_tag,
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)