/* coding=utf-8
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include "type_shim.h"
#include <assert.h>
#include <cfloat>
#include <limits>
#include <stdint.h>
#include <c10/macros/Macros.h>

namespace
{
    // Hard-coded hyperparameters
    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
    constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
    constexpr int BUFFER_SIZE = 32;
    constexpr int FILTER_SIZE = 12;
    constexpr int HALF_FILTER_SIZE = 6;
    constexpr int UPSAMPLE_REPLICATION_PAD = 5;         // 5 on each side, matching torch impl
    constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5;  // matching torch impl
    constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl

    template <typename input_t, typename output_t, typename acc_t>
    __global__ void anti_alias_activation_forward(
        output_t *dst,
        const input_t *src,
        const input_t *up_ftr,
        const input_t *down_ftr,
        const input_t *alpha,
        const input_t *beta,
        int batch_size,
        int channels,
        int seq_len)
    {
        // Up and downsample filters
        input_t up_filter[FILTER_SIZE];
        input_t down_filter[FILTER_SIZE];

        // Load data from global memory including extra indices reserved for replication paddings
        input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
        input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};

        // Output stores downsampled output before writing to dst
        output_t output[BUFFER_SIZE];

        // blockDim/threadIdx = (128, 1, 1)
        // gridDim/blockIdx = (seq_blocks, channels, batches)
        int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
        int local_offset = threadIdx.x * BUFFER_SIZE;
        int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;

        // intermediates have double the seq_len
        int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
        int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;

        // Get values needed for replication padding before moving pointer
        const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
        input_t seq_left_most_value = right_most_pntr[0];
        input_t seq_right_most_value = right_most_pntr[seq_len - 1];

        // Move src and dst pointers
        src += block_offset + local_offset;
        dst += block_offset + local_offset;
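        // Per-thread pipeline, as implemented by the loops below: each thread owns
        // BUFFER_SIZE contiguous input samples and
        //   1. loads them with replication padding at the sequence borders,
        //   2. upsamples 2x with the FILTER_SIZE-tap up_filter (inputs land in the
        //      even slots of `elements`, scaled by 2; odd slots stay zero),
        //   3. applies the snake activation x + (1 / beta) * sin^2(alpha * x) elementwise,
        //   4. replication-pads the intermediates and downsamples 2x (stride 2) with down_filter,
        // so dst has the same seq_len as src. alpha and beta are per-channel and are
        // exponentiated below, i.e. the tensors are expected to hold log-parameters.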
        // Alpha and beta values for snake activations. Applies exp by default.
        alpha = alpha + blockIdx.y;
        input_t alpha_val = expf(alpha[0]);
        beta = beta + blockIdx.y;
        input_t beta_val = expf(beta[0]);

        #pragma unroll
        for (int it = 0; it < FILTER_SIZE; it += 1)
        {
            up_filter[it] = up_ftr[it];
            down_filter[it] = down_ftr[it];
        }

        // Apply replication padding for upsampling, matching torch impl
        #pragma unroll
        for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
        {
            int element_index = seq_offset + it; // index for element
            if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
            {
                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
            }
            if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
            {
                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
            }
            if ((element_index >= 0) && (element_index < seq_len))
            {
                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
            }
        }

        // Apply upsampling strided convolution and write to intermediates. It reserves
        // DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later
        #pragma unroll
        for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
        {
            input_t acc = 0.0;
            int element_index = intermediate_seq_offset + it; // index for intermediate
            #pragma unroll
            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
            {
                if ((element_index + f_idx) >= 0)
                {
                    acc += up_filter[f_idx] * elements[it + f_idx];
                }
            }
            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
        }

        // Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and
        // DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampling conv later
        double no_div_by_zero = 0.000000001;
        #pragma unroll
        for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
        {
            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] +=
                (1.0 / (beta_val + no_div_by_zero)) *
                sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) *
                sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
        }

        // Apply replication padding before downsampling conv from intermediates
        #pragma unroll
        for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
        {
            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
        }
        #pragma unroll
        for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE;
             it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT;
             it += 1)
        {
            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
        }

        // Apply downsample strided convolution (assuming stride=2) from intermediates
        #pragma unroll
        for (int it = 0; it < BUFFER_SIZE; it += 1)
        {
            input_t acc = 0.0;
            #pragma unroll
            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
            {
                // Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
                acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
            }
            output[it] = acc;
        }

        // Write output to dst
        #pragma unroll
        for (int it = 0; it < BUFFER_SIZE; it += ELEMENTS_PER_LDG_STG)
        {
            int element_index = seq_offset + it;
            if (element_index < seq_len)
            {
                dst[it] = output[it];
            }
        }
    }
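    // Launch geometry for the kernel above: blocks of 128 threads, each thread
    // producing BUFFER_SIZE output samples, so one block covers
    // 128 * BUFFER_SIZE = 4096 samples of the sequence (seq_len_per_block below).
    // The grid is (ceil(seq_len / 4096), channels, batches), matching the
    // blockIdx usage inside the kernel.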
    template <typename input_t, typename output_t, typename acc_t>
    void dispatch_anti_alias_activation_forward(
        output_t *dst,
        const input_t *src,
        const input_t *up_ftr,
        const input_t *down_ftr,
        const input_t *alpha,
        const input_t *beta,
        int batch_size,
        int channels,
        int seq_len)
    {
        if (seq_len == 0)
        {
            return;
        }
        else
        {
            // Use 128 threads per block to maximize GPU utilization
            constexpr int threads_per_block = 128;
            constexpr int seq_len_per_block = 4096;
            int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
            dim3 blocks(blocks_per_seq_len, channels, batch_size);
            dim3 threads(threads_per_block, 1, 1);

            anti_alias_activation_forward<input_t, output_t, acc_t>
                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
                    dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
        }
    }
}

extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
{
    // Input is a 3d tensor with dimensions [batches, channels, seq_len]
    const int batches = input.size(0);
    const int channels = input.size(1);
    const int seq_len = input.size(2);

    // Output
    auto act_options = input.options().requires_grad(false);

    torch::Tensor anti_alias_activation_results =
        torch::empty({batches, channels, seq_len}, act_options);

    void *input_ptr = static_cast<void *>(input.data_ptr());
    void *up_filter_ptr = static_cast<void *>(up_filter.data_ptr());
    void *down_filter_ptr = static_cast<void *>(down_filter.data_ptr());
    void *alpha_ptr = static_cast<void *>(alpha.data_ptr());
    void *beta_ptr = static_cast<void *>(beta.data_ptr());
    void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());

    DISPATCH_FLOAT_HALF_AND_BFLOAT(
        input.scalar_type(),
        "dispatch anti alias activation_forward",
        dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
            reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
            reinterpret_cast<const scalar_t *>(input_ptr),
            reinterpret_cast<const scalar_t *>(up_filter_ptr),
            reinterpret_cast<const scalar_t *>(down_filter_ptr),
            reinterpret_cast<const scalar_t *>(alpha_ptr),
            reinterpret_cast<const scalar_t *>(beta_ptr),
            batches,
            channels,
            seq_len););
    return anti_alias_activation_results;
}
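// ---------------------------------------------------------------------------
// Usage sketch (illustrative only): this file has no Python binding of its own,
// so fwd_cuda is presumably exposed to Python from a separate binding .cpp when
// the project is built as a PyTorch CUDA extension. The binding and the call
// below are assumptions; only the fwd_cuda signature above is authoritative.
//
//   // Hypothetical binding .cpp:
//   //   torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter,
//   //                          torch::Tensor const &down_filter, torch::Tensor const &alpha,
//   //                          torch::Tensor const &beta);
//   //   PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
//   //   {
//   //       m.def("forward", &fwd_cuda, "anti-alias snake activation forward (CUDA)");
//   //   }
//
//   // Hypothetical host-side call; layout must be [batches, channels, seq_len],
//   // filters must have FILTER_SIZE = 12 taps, alpha/beta hold per-channel log-parameters:
//   //   auto opts  = torch::dtype(torch::kFloat32).device(torch::kCUDA);
//   //   auto x     = torch::randn({8, 512, 8192}, opts);
//   //   auto up    = torch::randn({12}, opts);
//   //   auto down  = torch::randn({12}, opts);
//   //   auto alpha = torch::zeros({512}, opts);
//   //   auto beta  = torch::zeros({512}, opts);
//   //   auto y     = fwd_cuda(x, up, down, alpha, beta); // same shape as x
// ---------------------------------------------------------------------------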