1160 lines
36 KiB
Plaintext
1160 lines
36 KiB
Plaintext
/***************************************************************************************************
|
|
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
|
* list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
**************************************************************************************************/
|
|
/* \file
|
|
\brief Helper functions for mapping CUTLASS concepts to cuBLAS.
|
|
*/
|
|
|
|
#include <stdexcept>
|
|
|
|
#if CUTLASS_ENABLE_CUBLAS
|
|
#include "cublas_helpers.h"
|
|
|
|
namespace cutlass {
|
|
namespace profiler {
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Converts a cuBLAS status to cutlass::Status
|
|
Status get_cutlass_status(cublasStatus_t cublas) {
|
|
|
|
switch (cublas) {
|
|
case CUBLAS_STATUS_SUCCESS:
|
|
return Status::kSuccess;
|
|
case CUBLAS_STATUS_INVALID_VALUE:
|
|
return Status::kErrorInvalidProblem;
|
|
case CUBLAS_STATUS_NOT_SUPPORTED:
|
|
return Status::kErrorNotSupported;
|
|
default: break;
|
|
}
|
|
return Status::kErrorInternal;
|
|
}
|
|
|
|
/// Converts a cuBLASS status to cutlass::profiler::Disposition
|
|
Disposition get_cutlass_disposition(cublasStatus_t cublas_status) {
|
|
|
|
if (cublas_status == CUBLAS_STATUS_INVALID_VALUE) {
|
|
return Disposition::kInvalidProblem;
|
|
}
|
|
else if (cublas_status == CUBLAS_STATUS_NOT_SUPPORTED) {
|
|
return Disposition::kNotSupported;
|
|
}
|
|
return Disposition::kFailed;
|
|
}
|
|
|
|
/// Maps a CUTLASS tensor layout to a cuBLAS transpose operation
|
|
bool get_cublas_transpose_operation(
|
|
cublasOperation_t &operation,
|
|
library::LayoutTypeID layout,
|
|
library::ComplexTransform transform) {
|
|
|
|
switch (layout) {
|
|
case library::LayoutTypeID::kColumnMajor:
|
|
if (transform == library::ComplexTransform::kNone) {
|
|
operation = CUBLAS_OP_N;
|
|
return true;
|
|
}
|
|
else {
|
|
return false;
|
|
}
|
|
break;
|
|
case library::LayoutTypeID::kRowMajor:
|
|
if (transform == library::ComplexTransform::kNone) {
|
|
operation = CUBLAS_OP_T;
|
|
return true;
|
|
}
|
|
else if (transform == library::ComplexTransform::kConjugate) {
|
|
operation = CUBLAS_OP_C;
|
|
return true;
|
|
}
|
|
break;
|
|
default: break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Maps a CUTLASS numeric type to a cuBLAS data type enumeration
|
|
bool get_cublas_datatype(cublasDataType_t &data_type, library::NumericTypeID element_type) {
|
|
switch (element_type) {
|
|
case library::NumericTypeID::kF16:
|
|
data_type = CUDA_R_16F;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kBF16:
|
|
break;
|
|
|
|
case library::NumericTypeID::kTF32:
|
|
break;
|
|
|
|
case library::NumericTypeID::kF32:
|
|
data_type = CUDA_R_32F;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kF64:
|
|
data_type = CUDA_R_64F;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kS4:
|
|
break;
|
|
|
|
case library::NumericTypeID::kS8:
|
|
data_type = CUDA_R_8I;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kS16:
|
|
break;
|
|
|
|
case library::NumericTypeID::kS32:
|
|
data_type = CUDA_R_32I;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kS64:
|
|
break;
|
|
|
|
case library::NumericTypeID::kU4:
|
|
break;
|
|
|
|
case library::NumericTypeID::kU8:
|
|
data_type = CUDA_R_8U;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kU16:
|
|
break;
|
|
|
|
case library::NumericTypeID::kU32:
|
|
data_type = CUDA_R_32U;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kU64:
|
|
break;
|
|
|
|
case library::NumericTypeID::kB1:
|
|
break;
|
|
|
|
case library::NumericTypeID::kCF32:
|
|
data_type = CUDA_C_32F;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kCF64:
|
|
data_type = CUDA_C_64F;
|
|
return true;
|
|
|
|
case library::NumericTypeID::kInvalid:
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Maps a cutlass::SideMode to cuBLAS side mode
|
|
bool get_cublas_side_mode(cublasSideMode_t& side, SideMode side_mode) {
|
|
|
|
switch (side_mode) {
|
|
case SideMode::kLeft:
|
|
side = CUBLAS_SIDE_LEFT;
|
|
return true;
|
|
case SideMode::kRight:
|
|
side = CUBLAS_SIDE_RIGHT;
|
|
return true;
|
|
default: break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Maps a cutlass::FillMode to cuBLAS fill mode
|
|
bool get_cublas_fill_mode(cublasFillMode_t& uplo, FillMode fill_mode) {
|
|
|
|
switch (fill_mode) {
|
|
case FillMode::kLower:
|
|
uplo = CUBLAS_FILL_MODE_LOWER;
|
|
return true;
|
|
case FillMode::kUpper:
|
|
uplo = CUBLAS_FILL_MODE_UPPER;
|
|
return true;
|
|
default: break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Maps a cutlass::DiagType to cuBLAS diag type
|
|
bool get_cublas_diag_type(cublasDiagType_t& diag, DiagType diag_type) {
|
|
|
|
switch (diag_type) {
|
|
case DiagType::kNonUnit:
|
|
diag = CUBLAS_DIAG_NON_UNIT;
|
|
return true;
|
|
case DiagType::kUnit:
|
|
diag = CUBLAS_DIAG_UNIT;
|
|
return true;
|
|
default: break;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Gets the cublas algorithm given threadblock tile dimensions and math opcode class
|
|
cublasGemmAlgo_t get_cublas_gemm_algo(int cta_m, int cta_n, int cta_k, library::OpcodeClassID opcode_class) {
|
|
return (opcode_class == library::OpcodeClassID::kSimt ?
|
|
CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP);
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Returns a status if cuBLAS can satisfy a particular GEMM description
|
|
Status cublas_satisfies(library::GemmDescription const &desc) {
|
|
auto const &math_instruction = desc.tile_description.math_instruction;
|
|
|
|
if (math_instruction.element_accumulator == library::NumericTypeID::kS32 &&
|
|
math_instruction.opcode_class == library::OpcodeClassID::kTensorOp) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// output type S4 and S8 not supported in cuBLAS
|
|
if (desc.C.element == library::NumericTypeID::kS4 ||
|
|
desc.C.element == library::NumericTypeID::kS8) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
return Status::kSuccess;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
namespace detail {
|
|
|
|
/// Builds the cuBLAS call parameters (transposes, data types, compute type) for a GEMM
/// description. If any translation fails, `status` is set to Status::kErrorNotSupported;
/// otherwise it stays Status::kSuccess.
cublasGemmExDispatcher::cublasGemmExDispatcher(
  library::GemmDescription const &op_desc,
  library::GemmUniversalConfiguration configuration_,
  library::GemmUniversalArguments arguments_,
  cublasGemmAlgo_t algorithm
):
  configuration(configuration_), arguments(arguments_), algo(algorithm), status(Status::kSuccess) {

  auto const &math_instruction = op_desc.tile_description.math_instruction;

  // Translations are evaluated left to right and stop at the first one that fails.
  bool supported =
    get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A) &&
    get_cublas_transpose_operation(trans_B, op_desc.B.layout, op_desc.transform_B) &&
    get_cublas_datatype(data_type_A, op_desc.A.element) &&
    get_cublas_datatype(data_type_B, op_desc.B.element) &&
    get_cublas_datatype(data_type_C, op_desc.C.element) &&
    get_cublas_datatype(compute_data_type, math_instruction.element_accumulator);

  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
  // internal numerical data types used in the computation.
#if (__CUDACC_VER_MAJOR__ >= 11)
  if (supported) {

    bool const f32_tensor_op =
      (op_desc.A.element == library::NumericTypeID::kF32) &&
      (op_desc.B.element == library::NumericTypeID::kF32) &&
      (math_instruction.opcode_class == library::OpcodeClassID::kTensorOp);

    if (f32_tensor_op) {
      // F32 inputs on tensor cores use the fast TF32 compute type.
      compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
    }
    else {
      // Pedantic compute types are available but currently never selected.
      bool const pedantic = false;

      switch (compute_data_type) {
        case CUDA_R_32F:
        case CUDA_C_32F:
          compute_type = pedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
          break;

        case CUDA_R_64F:
        case CUDA_C_64F:
          compute_type = pedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
          break;

        case CUDA_R_16F:
          compute_type = pedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
          break;

        case CUDA_R_32I:
          compute_type = pedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
          break;

        default:
          // Accumulator type without a matching cublasComputeType.
          supported = false;
          break;
      }
    }
  }
#endif // __CUDACC_VER_MAJOR__ >= 11

  if (!supported) {
    status = Status::kErrorNotSupported;
  }
}
|
|
|
|
/// Executes GEMM using these arguments
|
|
cublasStatus_t cublasGemmExDispatcher::operator()(cublasHandle_t handle) {
|
|
|
|
if (configuration.mode == library::GemmUniversalMode::kBatched) {
|
|
return cublasGemmStridedBatchedEx(
|
|
handle,
|
|
trans_A,
|
|
trans_B,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
arguments.alpha,
|
|
arguments.A,
|
|
data_type_A,
|
|
int(configuration.lda),
|
|
arguments.batch_stride_A,
|
|
arguments.B,
|
|
data_type_B,
|
|
int(configuration.ldb),
|
|
arguments.batch_stride_B,
|
|
arguments.beta,
|
|
arguments.D,
|
|
data_type_C,
|
|
int(configuration.ldc),
|
|
arguments.batch_stride_C,
|
|
configuration.batch_count,
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
compute_type,
|
|
#else
|
|
compute_data_type,
|
|
#endif
|
|
algo
|
|
);
|
|
}
|
|
else {
|
|
return cublasGemmEx(
|
|
handle,
|
|
trans_A,
|
|
trans_B,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
arguments.alpha,
|
|
arguments.A,
|
|
data_type_A,
|
|
int(configuration.lda),
|
|
arguments.B,
|
|
data_type_B,
|
|
int(configuration.ldb),
|
|
arguments.beta,
|
|
arguments.D,
|
|
data_type_C,
|
|
int(configuration.ldc),
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
compute_type,
|
|
#else
|
|
compute_data_type,
|
|
#endif
|
|
algo
|
|
);
|
|
}
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Returns a status if cuBLAS can satisfy a particular RankK description
|
|
Status cublas_satisfies(library::RankKDescription const &desc) {
|
|
auto const &math_instruction = desc.tile_description.math_instruction;
|
|
|
|
if (math_instruction.element_accumulator == library::NumericTypeID::kS32 &&
|
|
math_instruction.opcode_class == library::OpcodeClassID::kTensorOp) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// output type S4 and S8 not supported in cuBLAS
|
|
if (desc.C.element == library::NumericTypeID::kS4 ||
|
|
desc.C.element == library::NumericTypeID::kS8) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// input type BF16 and TF32 not supported in cuBLAS
|
|
if (desc.A.element == library::NumericTypeID::kBF16 ||
|
|
desc.A.element == library::NumericTypeID::kTF32) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
return Status::kSuccess;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
namespace detail {
|
|
|
|
/// Builds the cuBLAS call parameters (transpose, fill mode, data types, compute type) for a
/// RankK description. If any translation fails, `status` is set to Status::kErrorNotSupported;
/// otherwise it stays Status::kSuccess.
cublasRankKDispatcher::cublasRankKDispatcher(
  library::RankKDescription const &op_desc,
  library::RankKConfiguration configuration_,
  library::RankKArguments arguments_
):
  configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {

  blas_mode = op_desc.blas_mode;
  num_ranks = op_desc.num_ranks;

  auto const &math_instruction = op_desc.tile_description.math_instruction;

  // Translations are evaluated left to right and stop at the first one that fails.
  bool supported =
    get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A) &&
    get_cublas_fill_mode(uplo, op_desc.fill_mode) &&
    get_cublas_datatype(data_type_A, op_desc.A.element) &&
    get_cublas_datatype(data_type_C, op_desc.C.element) &&
    get_cublas_datatype(compute_data_type, math_instruction.element_accumulator);

  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
  // internal numerical data types used in the computation.
#if (__CUDACC_VER_MAJOR__ >= 11)
  if (supported) {

    bool const f32_tensor_op =
      (op_desc.A.element == library::NumericTypeID::kF32) &&
      (math_instruction.opcode_class == library::OpcodeClassID::kTensorOp);

    if (f32_tensor_op) {
      // F32 input on tensor cores uses the fast TF32 compute type.
      compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
    }
    else {
      // Pedantic compute types are available but currently never selected.
      bool const pedantic = false;

      switch (compute_data_type) {
        case CUDA_R_32F:
        case CUDA_C_32F:
          compute_type = pedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
          break;

        case CUDA_R_64F:
        case CUDA_C_64F:
          compute_type = pedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
          break;

        case CUDA_R_16F:
          compute_type = pedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
          break;

        case CUDA_R_32I:
          compute_type = pedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
          break;

        default:
          // Accumulator type without a matching cublasComputeType.
          supported = false;
          break;
      }
    }
  }
#endif // __CUDACC_VER_MAJOR__ >= 11

  if (!supported) {
    status = Status::kErrorNotSupported;
  }
}
|
|
|
|
/// Executes RankK using these arguments
|
|
cublasStatus_t cublasRankKDispatcher::operator()(cublasHandle_t handle) {
|
|
|
|
// SYRK and HERK
|
|
if (num_ranks == 1) {
|
|
if (data_type_A == data_type_C && data_type_A == CUDA_R_64F) {
|
|
return cublasDsyrk(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const double*>(arguments.alpha),
|
|
static_cast<const double*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const double*>(arguments.beta),
|
|
static_cast<double*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_R_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
return cublasSsyrk(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const float*>(arguments.alpha),
|
|
static_cast<const float*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const float*>(arguments.beta),
|
|
static_cast<float*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_C_64F) {
|
|
|
|
if (blas_mode == BlasMode::kHermitian) {
|
|
return cublasZherk(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const double*>(arguments.alpha),
|
|
static_cast<const cuDoubleComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const double*>(arguments.beta),
|
|
static_cast<cuDoubleComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
else {
|
|
return cublasZsyrk(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const cuDoubleComplex*>(arguments.alpha),
|
|
static_cast<const cuDoubleComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuDoubleComplex*>(arguments.beta),
|
|
static_cast<cuDoubleComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_C_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
if (blas_mode == BlasMode::kHermitian) {
|
|
return cublasCherk(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const float*>(arguments.alpha),
|
|
static_cast<const cuComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const float*>(arguments.beta),
|
|
static_cast<cuComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
else {
|
|
return cublasCsyrk(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const cuComplex*>(arguments.alpha),
|
|
static_cast<const cuComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuComplex*>(arguments.beta),
|
|
static_cast<cuComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
} else {
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
}
|
|
}
|
|
|
|
// SYR2K and HER2K
|
|
else if (num_ranks == 2) {
|
|
if (data_type_A == data_type_C && data_type_A == CUDA_R_64F) {
|
|
return cublasDsyr2k(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const double*>(arguments.alpha),
|
|
static_cast<const double*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const double*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const double*>(arguments.beta),
|
|
static_cast<double*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_R_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
return cublasSsyr2k(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const float*>(arguments.alpha),
|
|
static_cast<const float*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const float*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const float*>(arguments.beta),
|
|
static_cast<float*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_C_64F) {
|
|
|
|
if (blas_mode == BlasMode::kHermitian) {
|
|
return cublasZher2k(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const cuDoubleComplex*>(arguments.alpha),
|
|
static_cast<const cuDoubleComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuDoubleComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const double*>(arguments.beta),
|
|
static_cast<cuDoubleComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
else {
|
|
return cublasZsyr2k(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const cuDoubleComplex*>(arguments.alpha),
|
|
static_cast<const cuDoubleComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuDoubleComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const cuDoubleComplex*>(arguments.beta),
|
|
static_cast<cuDoubleComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_C_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
if (blas_mode == BlasMode::kHermitian) {
|
|
return cublasCher2k(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const cuComplex*>(arguments.alpha),
|
|
static_cast<const cuComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const float*>(arguments.beta),
|
|
static_cast<cuComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
else {
|
|
return cublasCsyr2k(
|
|
handle,
|
|
uplo,
|
|
trans_A,
|
|
configuration.problem_size.n(),
|
|
configuration.problem_size.k(),
|
|
static_cast<const cuComplex*>(arguments.alpha),
|
|
static_cast<const cuComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const cuComplex*>(arguments.beta),
|
|
static_cast<cuComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
} else {
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
}
|
|
}
|
|
else {
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
}
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Returns a status if cuBLAS can satisfy a particular TRMM description
|
|
Status cublas_satisfies(library::TrmmDescription const &desc) {
|
|
auto const &math_instruction = desc.tile_description.math_instruction;
|
|
|
|
if (math_instruction.element_accumulator == library::NumericTypeID::kS32 &&
|
|
math_instruction.opcode_class == library::OpcodeClassID::kTensorOp) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// output type S4 and S8 not supported in cuBLAS
|
|
if (desc.D.element == library::NumericTypeID::kS4 ||
|
|
desc.D.element == library::NumericTypeID::kS8) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// input type BF16 and TF32 not supported in cuBLAS
|
|
if (desc.A.element == library::NumericTypeID::kBF16 ||
|
|
desc.A.element == library::NumericTypeID::kTF32) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
return Status::kSuccess;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
namespace detail {
|
|
|
|
/// Builds the cuBLAS call parameters (transpose, side, fill mode, diagonal type, data types,
/// compute type) for a TRMM description. If any translation fails, `status` is set to
/// Status::kErrorNotSupported; otherwise it stays Status::kSuccess.
///
/// \param op_desc         description of the TRMM operation to translate
/// \param configuration_  problem configuration, stored by value
/// \param arguments_      operand pointers and scalars, stored by value
cublasTrmmDispatcher::cublasTrmmDispatcher(
  library::TrmmDescription const &op_desc,
  library::TrmmConfiguration configuration_,
  library::TrmmArguments arguments_
):
  configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {

  bool good = true;

  // Each translation runs only if all earlier ones succeeded, so after a failure the
  // remaining output members are never written.
  good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A));
  good = (good && get_cublas_side_mode(side, op_desc.side_mode));
  good = (good && get_cublas_fill_mode(uplo, op_desc.fill_mode));
  good = (good && get_cublas_diag_type(diag, op_desc.diag_type));
  good = (good && get_cublas_datatype(data_type_A, op_desc.A.element));
  good = (good && get_cublas_datatype(data_type_B, op_desc.B.element));
  good = (good && get_cublas_datatype(data_type_D, op_desc.D.element));

  // if A is Transposed, then for cuBLAS that is inverted Fill Mode.
  // Guarded by `good`: trans_A and uplo are only known to have been written when every
  // translation above succeeded, so they must not be read after a failure.
  if (good && (trans_A == CUBLAS_OP_T || trans_A == CUBLAS_OP_C)) {
    if (uplo == CUBLAS_FILL_MODE_LOWER)
      uplo = CUBLAS_FILL_MODE_UPPER;
    else
      uplo = CUBLAS_FILL_MODE_LOWER;
  }

  good = (good && get_cublas_datatype(
    compute_data_type,
    op_desc.tile_description.math_instruction.element_accumulator));

  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
  // internal numerical data types used in the computation.
#if (__CUDACC_VER_MAJOR__ >= 11)
  library::OpcodeClassID const & opcode_class =
    op_desc.tile_description.math_instruction.opcode_class;

  if (good &&
    op_desc.A.element == library::NumericTypeID::kF32 &&
    opcode_class == library::OpcodeClassID::kTensorOp) {

    // F32 input on tensor cores uses the fast TF32 compute type.
    compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
  }
  else if (good) {
    // Pedantic compute types are available but currently never selected.
    bool const isPedantic = false;
    switch (compute_data_type) {
      case CUDA_R_32F:
      case CUDA_C_32F:
        compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
        break;
      case CUDA_R_64F:
      case CUDA_C_64F:
        compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
        break;
      case CUDA_R_16F:
        compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
        break;
      case CUDA_R_32I:
        compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
        break;
      default:
        // Accumulator type without a matching cublasComputeType.
        good = false;
        break;
    }
  }
#endif // __CUDACC_VER_MAJOR__ >= 11

  if (!good) {
    status = Status::kErrorNotSupported;
  }
}
|
|
|
|
/// Executes TRMM using these arguments
|
|
cublasStatus_t cublasTrmmDispatcher::operator()(cublasHandle_t handle) {
|
|
|
|
if (data_type_A == data_type_D && data_type_A == CUDA_R_64F) {
|
|
return cublasDtrmm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
trans_A,
|
|
diag,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const double*>(arguments.alpha),
|
|
static_cast<const double*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const double*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<double*>(arguments.D),
|
|
int(configuration.ldd)
|
|
);
|
|
} else if (data_type_A == data_type_D && data_type_A == CUDA_R_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
return cublasStrmm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
trans_A,
|
|
diag,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const float*>(arguments.alpha),
|
|
static_cast<const float*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const float*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<float*>(arguments.D),
|
|
int(configuration.ldd)
|
|
);
|
|
} else if (data_type_A == data_type_D && data_type_A == CUDA_C_64F) {
|
|
return cublasZtrmm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
trans_A,
|
|
diag,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const cuDoubleComplex*>(arguments.alpha),
|
|
static_cast<const cuDoubleComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuDoubleComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<cuDoubleComplex*>(arguments.D),
|
|
int(configuration.ldd)
|
|
);
|
|
} else if (data_type_A == data_type_D && data_type_A == CUDA_C_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
return cublasCtrmm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
trans_A,
|
|
diag,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const cuComplex*>(arguments.alpha),
|
|
static_cast<const cuComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<cuComplex*>(arguments.D),
|
|
int(configuration.ldd)
|
|
);
|
|
} else {
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
}
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Returns a status if cuBLAS can satisfy a particular Symm description
|
|
Status cublas_satisfies(library::SymmDescription const &desc) {
|
|
auto const &math_instruction = desc.tile_description.math_instruction;
|
|
|
|
if (math_instruction.element_accumulator == library::NumericTypeID::kS32 &&
|
|
math_instruction.opcode_class == library::OpcodeClassID::kTensorOp) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// output type S4 and S8 not supported in cuBLAS
|
|
if (desc.C.element == library::NumericTypeID::kS4 ||
|
|
desc.C.element == library::NumericTypeID::kS8) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// input type BF16 and TF32 not supported in cuBLAS
|
|
if (desc.A.element == library::NumericTypeID::kBF16 ||
|
|
desc.A.element == library::NumericTypeID::kTF32) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// input type BF16 and TF32 not supported in cuBLAS
|
|
if (desc.B.element == library::NumericTypeID::kBF16 ||
|
|
desc.B.element == library::NumericTypeID::kTF32) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
// only column major layout is supported in cuBLAS
|
|
if (desc.A.layout != library::LayoutTypeID::kColumnMajor ||
|
|
desc.transform_A != library::ComplexTransform::kNone) {
|
|
|
|
return Status::kErrorNotSupported;
|
|
}
|
|
|
|
return Status::kSuccess;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
namespace detail {
|
|
|
|
/// Builds the cuBLAS call parameters (side, fill mode, data types, compute type) for a Symm
/// description. If any translation fails, `status` is set to Status::kErrorNotSupported;
/// otherwise it stays Status::kSuccess.
cublasSymmDispatcher::cublasSymmDispatcher(
  library::SymmDescription const &op_desc,
  library::SymmConfiguration configuration_,
  library::SymmArguments arguments_
):
  configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {

  blas_mode = op_desc.blas_mode;

  auto const &math_instruction = op_desc.tile_description.math_instruction;

  // Translations are evaluated left to right and stop at the first one that fails.
  bool supported =
    get_cublas_side_mode(side, op_desc.side_mode) &&
    get_cublas_fill_mode(uplo, op_desc.fill_mode) &&
    get_cublas_datatype(data_type_A, op_desc.A.element) &&
    get_cublas_datatype(data_type_C, op_desc.C.element) &&
    get_cublas_datatype(compute_data_type, math_instruction.element_accumulator);

  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
  // internal numerical data types used in the computation.
#if (__CUDACC_VER_MAJOR__ >= 11)
  if (supported) {

    bool const f32_tensor_op =
      (op_desc.A.element == library::NumericTypeID::kF32) &&
      (math_instruction.opcode_class == library::OpcodeClassID::kTensorOp);

    if (f32_tensor_op) {
      // F32 input on tensor cores uses the fast TF32 compute type.
      compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
    }
    else {
      // Pedantic compute types are available but currently never selected.
      bool const pedantic = false;

      switch (compute_data_type) {
        case CUDA_R_32F:
        case CUDA_C_32F:
          compute_type = pedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
          break;

        case CUDA_R_64F:
        case CUDA_C_64F:
          compute_type = pedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
          break;

        case CUDA_R_16F:
          compute_type = pedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
          break;

        case CUDA_R_32I:
          compute_type = pedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
          break;

        default:
          // Accumulator type without a matching cublasComputeType.
          supported = false;
          break;
      }
    }
  }
#endif // __CUDACC_VER_MAJOR__ >= 11

  if (!supported) {
    status = Status::kErrorNotSupported;
  }
}
|
|
|
|
/// Executes Symm using these arguments
|
|
cublasStatus_t cublasSymmDispatcher::operator()(cublasHandle_t handle) {
|
|
|
|
// SYMM and HEMM
|
|
if (data_type_A == data_type_C && data_type_A == CUDA_R_64F) {
|
|
return cublasDsymm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const double*>(arguments.alpha),
|
|
static_cast<const double*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const double*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const double*>(arguments.beta),
|
|
static_cast<double*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_R_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
return cublasSsymm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const float*>(arguments.alpha),
|
|
static_cast<const float*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const float*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const float*>(arguments.beta),
|
|
static_cast<float*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_C_64F) {
|
|
|
|
if (blas_mode == BlasMode::kHermitian) {
|
|
return cublasZhemm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const cuDoubleComplex*>(arguments.alpha),
|
|
static_cast<const cuDoubleComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuDoubleComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const cuDoubleComplex*>(arguments.beta),
|
|
static_cast<cuDoubleComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
else {
|
|
return cublasZsymm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const cuDoubleComplex*>(arguments.alpha),
|
|
static_cast<const cuDoubleComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuDoubleComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const cuDoubleComplex*>(arguments.beta),
|
|
static_cast<cuDoubleComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
|
|
} else if (data_type_A == data_type_C && data_type_A == CUDA_C_32F) {
|
|
|
|
#if (__CUDACC_VER_MAJOR__ >= 11)
|
|
if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
#endif
|
|
|
|
if (blas_mode == BlasMode::kHermitian) {
|
|
return cublasChemm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const cuComplex*>(arguments.alpha),
|
|
static_cast<const cuComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const cuComplex*>(arguments.beta),
|
|
static_cast<cuComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
else {
|
|
return cublasCsymm(
|
|
handle,
|
|
side,
|
|
uplo,
|
|
configuration.problem_size.m(),
|
|
configuration.problem_size.n(),
|
|
static_cast<const cuComplex*>(arguments.alpha),
|
|
static_cast<const cuComplex*>(arguments.A),
|
|
int(configuration.lda),
|
|
static_cast<const cuComplex*>(arguments.B),
|
|
int(configuration.ldb),
|
|
static_cast<const cuComplex*>(arguments.beta),
|
|
static_cast<cuComplex*>(arguments.D),
|
|
int(configuration.ldc)
|
|
);
|
|
}
|
|
} else {
|
|
return CUBLAS_STATUS_NOT_SUPPORTED;
|
|
}
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
} // namespace profiler
|
|
} // namespace cutlass
|
|
|
|
#endif // #if CUTLASS_ENABLE_CUBLAS
|