/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "cutlass_extensions/include/cutlass_extensions/ft_gemm_configs.h"
#include "utils/activation_types.h"

#include <cuda_runtime_api.h>
#include <string>

namespace fastertransformer {

/*
  This runner only supports:
    T in {half, __nv_bfloat16}, WeightType in {int8_t, cutlass::uint4b_t}

  Activations, biases, scales and outputs are all assumed to be row-major.

  However, it is assumed that B is in a special format governed by
  cutlass_extensions/gemm/kernel/mixed_gemm_B_layout. In this case, B must be
  preprocessed using the cutlass weight-only quant preprocessors. The weight
  preprocessor will instantiate the layout and preprocess based on the
  instantiation, so layout changes should only require modifications to
  mixed_gemm_B_layout.h.
*/

template<typename T, typename WeightType>
class CutlassFpAIntBGemmRunner {
public:
    CutlassFpAIntBGemmRunner();
    ~CutlassFpAIntBGemmRunner();

    // Computes C = A * dequantize(B, weight_scales).
    void gemm(const T*          A,
              const WeightType* B,
              const T*          weight_scales,
              T*                C,
              int               m,
              int               n,
              int               k,
              char*             workspace_ptr,
              const size_t      workspace_bytes,
              cudaStream_t      stream);

    // As gemm(), but fuses a bias add and an activation into the epilogue.
    void gemm_bias_act(const T*          A,
                       const WeightType* B,
                       const T*          weight_scales,
                       const T*          biases,
                       T*                C,
                       int               m,
                       int               n,
                       int               k,
                       int               bias_stride,
                       ActivationType    activation_type,
                       char*             workspace_ptr,
                       const size_t      workspace_bytes,
                       cudaStream_t      stream);

    // As gemm_bias_act(), but additionally fuses a residual connection into
    // the epilogue via the given binary and unary ops.
    void gemm_bias_act_residual(const T*           A,
                                const WeightType*  B,
                                const T*           weight_scales,
                                const T*           biases,
                                const T*           residual,
                                T*                 C,
                                int                m,
                                int                n,
                                int                k,
                                const std::string& activation,
                                const std::string& binary_op,
                                const std::string& unary_op,
                                char*              workspace_ptr,
                                const size_t       workspace_bytes,
                                cudaStream_t       stream);

    // Returns desired workspace size in bytes.
    int getWorkspaceSize(const int m, const int n, const int k);

private:
    template<typename EpilogueTag>
    void dispatch_to_arch(const T*          A,
                          const WeightType* B,
                          const T*          weight_scales,
                          const T*          biases,
                          T*                C,
                          int               m,
                          int               n,
                          int               k,
                          int               bias_stride,
                          CutlassGemmConfig gemm_config,
                          char*             workspace_ptr,
                          const size_t      workspace_bytes,
                          cudaStream_t      stream,
                          int*              occupancy = nullptr);

    template<typename EpilogueTag>
    void run_gemm(const T*          A,
                  const WeightType* B,
                  const T*          weight_scales,
                  const T*          biases,
                  T*                C,
                  int               m,
                  int               n,
                  int               k,
                  int               bias_stride,
                  char*             workspace_ptr,
                  const size_t      workspace_bytes,
                  cudaStream_t      stream);

private:
    static constexpr int split_k_limit = 7;

    int sm_;
    int multi_processor_count_;
};

}  // namespace fastertransformer
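
/*
  Usage sketch (illustrative only, not part of this header). A minimal call
  sequence assuming `d_A`, `d_B`, `d_scales` and `d_C` are hypothetical valid
  device pointers of the appropriate types, and that `d_B` has already been
  packed by the cutlass weight-only quant preprocessor as described above:

    fastertransformer::CutlassFpAIntBGemmRunner<half, int8_t> runner;

    const int    m = 16, n = 4096, k = 4096;
    const size_t ws_bytes = runner.getWorkspaceSize(m, n, k);

    char* d_workspace = nullptr;
    cudaMalloc(&d_workspace, ws_bytes);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // C = A * dequantize(B, scales), with A row-major and B in the packed
    // mixed_gemm_B_layout format.
    runner.gemm(d_A, d_B, d_scales, d_C, m, n, k, d_workspace, ws_bytes, stream);
*/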