// Adapted from https://github.com/NVIDIA/apex/blob/master/csrc/fused_dense_cuda.cu
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <torch/torch.h>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
#endif

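// The two gemm_bias overloads below wrap cublasGemmEx with FP32 accumulation
// (CUDA_R_32F compute type) and tensor-op algorithm selection. As everywhere in
// cuBLAS, operands are interpreted as column-major, so callers pass dimensions
// and leading dimensions accordingly.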
// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(
    cublasHandle_t handle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float* alpha,
    at::Half* A,
    int lda,
    at::Half* B,
    int ldb,
    const float* beta,
    at::Half* C,
    int ldc) {
  return cublasGemmEx(
      handle,
      transa,
      transb,
      m,
      n,
      k,
      alpha,
      A,
      CUDA_R_16F,
      lda,
      B,
      CUDA_R_16F,
      ldb,
      beta,
      C,
      CUDA_R_16F,
      ldc,
      CUDA_R_32F,
      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

// BF16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(
    cublasHandle_t handle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float* alpha,
    at::BFloat16* A,
    int lda,
    at::BFloat16* B,
    int ldb,
    const float* beta,
    at::BFloat16* C,
    int ldc) {
  return cublasGemmEx(
      handle,
      transa,
      transb,
      m,
      n,
      k,
      alpha,
      A,
      CUDA_R_16BF,
      lda,
      B,
      CUDA_R_16BF,
      ldb,
      beta,
      C,
      CUDA_R_16BF,
      ldc,
      CUDA_R_32F,
      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600

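// Fused GEMM + bias + GELU via cublasLt epilogues. When gelu_in is non-null the
// pre-GELU values are additionally written out through the GELU_AUX(_BIAS)
// epilogue so the backward pass can apply the GELU derivative; otherwise the
// plain GELU(_BIAS) epilogue is used. Returns 0 on success, 1 on any failure.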
int gemm_bias_gelu_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float *alpha, /* host pointer */
    at::Half* A,
    int lda,
    at::Half* B,
    int ldb,
    const float *beta, /* host pointer */
    at::Half* C,
    int64_t ldc,
    void *workspace,
    size_t workspaceSize,
    cudaStream_t stream,
    int heuristic,
    const void* gelu_in,
    const void* bias) {
  bool save_gelu_in = gelu_in != nullptr;
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  constexpr int requestedAlgoCount = 5;
  cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0};
  cublasLtEpilogue_t epilogue = save_gelu_in ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (save_gelu_in) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
    if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
    if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  }

  if (bias != nullptr) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = save_gelu_in ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS : CUBLASLT_EPILOGUE_GELU_BIAS;
  } else {
    epilogue = save_gelu_in ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle,
                          &operationDesc,
                          alpha,
                          A,
                          &Adesc,
                          B,
                          &Bdesc,
                          beta,
                          C,
                          &Cdesc,
                          C,
                          &Cdesc,
                          // TD [2022-04-29] Somehow algo 0 and 2 are a lot slower than other algos
                          &heuristicResult[heuristic].algo,
                          workspace,
                          workspaceSize,
                          stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

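// BF16 overload of the fused GEMM + bias + GELU above.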
int gemm_bias_gelu_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float *alpha, /* host pointer */
    at::BFloat16* A,
    int lda,
    at::BFloat16* B,
    int ldb,
    const float *beta, /* host pointer */
    at::BFloat16* C,
    int64_t ldc,
    void *workspace,
    size_t workspaceSize,
    cudaStream_t stream,
    int heuristic,
    const void* gelu_in,
    const void* bias) {
  bool save_gelu_in = gelu_in != nullptr;
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  constexpr int requestedAlgoCount = 5;
  cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0};
  cublasLtEpilogue_t epilogue = save_gelu_in ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (save_gelu_in) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
    if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
    if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  }

  if (bias != nullptr) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = save_gelu_in ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS : CUBLASLT_EPILOGUE_GELU_BIAS;
  } else {
    epilogue = save_gelu_in ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle,
                          &operationDesc,
                          alpha,
                          A,
                          &Adesc,
                          B,
                          &Bdesc,
                          beta,
                          C,
                          &Cdesc,
                          C,
                          &Cdesc,
                          // TD [2022-04-29] Somehow algo 0 and 2 are a lot slower than other algos
                          &heuristicResult[heuristic].algo,
                          workspace,
                          workspaceSize,
                          stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

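// GEMM with an optional fused bias-gradient epilogue: when bgrad is non-null,
// CUBLASLT_EPILOGUE_BGRADB additionally reduces the B operand into bgrad
// (the bias gradient) alongside the matmul. Returns 0 on success, 1 on failure.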
int gemm_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float *alpha, /* host pointer */
    at::Half* A,
    int lda,
    at::Half* B,
    int ldb,
    const float *beta, /* host pointer */
    at::Half* C,
    int ldc,
    void *workspace,
    size_t workspaceSize,
    cudaStream_t stream,
    const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (bgrad != nullptr) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle,
                          &operationDesc,
                          alpha,
                          A,
                          &Adesc,
                          B,
                          &Bdesc,
                          beta,
                          C,
                          &Cdesc,
                          C,
                          &Cdesc,
                          // Passing NULL lets cublasLt pick an algorithm; the heuristic
                          // query above only verifies that at least one is available.
                          NULL,
                          workspace,
                          workspaceSize,
                          stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

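// BF16 overload of gemm_bgradb_lt above.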
int gemm_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float *alpha, /* host pointer */
    at::BFloat16* A,
    int lda,
    at::BFloat16* B,
    int ldb,
    const float *beta, /* host pointer */
    at::BFloat16* C,
    int ldc,
    void *workspace,
    size_t workspaceSize,
    cudaStream_t stream,
    const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (bgrad != nullptr) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle,
                          &operationDesc,
                          alpha,
                          A,
                          &Adesc,
                          B,
                          &Bdesc,
                          beta,
                          C,
                          &Cdesc,
                          C,
                          &Cdesc,
                          // Passing NULL lets cublasLt pick an algorithm; the heuristic
                          // query above only verifies that at least one is available.
                          NULL,
                          workspace,
                          workspaceSize,
                          stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

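// Fused backward GEMM through the GELU: CUBLASLT_EPILOGUE_DGELU_BGRAD applies
// the GELU derivative to the matmul result using the stashed pre-activations
// (gelu_in) and reduces the corresponding bias gradient into bgrad.
// Returns 0 on success, 1 on failure.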
int gemm_dgelu_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float *alpha, /* host pointer */
    at::Half* A,
    int lda,
    at::Half* B,
    int ldb,
    const float *beta, /* host pointer */
    at::Half* C,
    int64_t ldc,
    void *workspace,
    size_t workspaceSize,
    cudaStream_t stream,
    int heuristic,
    const void *gelu_in,
    const void *bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  constexpr int requestedAlgoCount = 5;
  cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle,
                          &operationDesc,
                          alpha,
                          A,
                          &Adesc,
                          B,
                          &Bdesc,
                          beta,
                          C,
                          &Cdesc,
                          C,
                          &Cdesc,
                          &heuristicResult[heuristic].algo,
                          workspace,
                          workspaceSize,
                          stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

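// BF16 overload of gemm_dgelu_bgradb_lt above.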
int gemm_dgelu_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float *alpha, /* host pointer */
    at::BFloat16* A,
    int lda,
    at::BFloat16* B,
    int ldb,
    const float *beta, /* host pointer */
    at::BFloat16* C,
    int64_t ldc,
    void *workspace,
    size_t workspaceSize,
    cudaStream_t stream,
    int heuristic,
    const void *gelu_in,
    const void *bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  constexpr int requestedAlgoCount = 5;
  cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }
  status = cublasLtMatmul(ltHandle,
                          &operationDesc,
                          alpha,
                          A,
                          &Adesc,
                          B,
                          &Bdesc,
                          beta,
                          C,
                          &Cdesc,
                          C,
                          &Cdesc,
                          &heuristicResult[heuristic].algo,
                          workspace,
                          workspaceSize,
                          stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

#endif

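// Weight-gradient (and, on the cublasLt path, fused bias-gradient) computation
// for a linear layer: d_weight gets the GEMM of input [batch_size, in_features]
// with d_output [batch_size, out_features], and d_bias gets the reduction of
// d_output over the batch. If the Lt path is unavailable or fails, falls back
// to plain cublasGemmEx, which computes the weight gradient only.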
template <typename T>
int linear_bias_wgrad_cuda(T *input, T *d_output, int in_features, int batch_size, int out_features, T *d_weight, T *d_bias, void *lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from the cublas handle so the Lt/GemmEx call runs on the same stream.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  status = gemm_bgradb_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_N,
      CUBLAS_OP_T,
      in_features,
      out_features,
      batch_size,
      &alpha, /* host pointer */
      input,
      in_features,
      d_output,
      out_features,
      &beta_zero, /* host pointer */
      d_weight,
      in_features,
      lt_workspace,
      1 << 22,
      stream,
      static_cast<const void*>(d_bias));
#endif

  if (status != 0) {
    status = gemm_bias(
        handle,
        CUBLAS_OP_N,
        CUBLAS_OP_T,
        in_features,
        out_features,
        batch_size,
        &alpha,
        input,
        in_features,
        d_output,
        out_features,
        &beta_zero,
        d_weight,
        in_features);
  }

  return status;
}

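// Fused forward pass of linear + GELU: output = GELU(input @ weight^T + bias),
// with the pre-GELU values written to gelu_in for the backward pass.
// Requires the cublasLt path (cuBLAS >= 11.6); otherwise returns 1.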
template <typename T>
int linear_gelu_forward_cuda(T *input, T *weight, T *bias, int in_features, int batch_size, int out_features, int heuristic, T *output, T *gelu_in, void *lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from the cublas handle so the Lt call runs on the same stream.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  status = gemm_bias_gelu_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_T,
      CUBLAS_OP_N,
      out_features,
      batch_size,
      in_features,
      &alpha, /* host pointer */
      weight,
      in_features,
      input,
      in_features,
      &beta_zero, /* host pointer */
      output,
      out_features,
      lt_workspace,
      1 << 22,
      stream,
      heuristic,
      static_cast<const void*>(gelu_in),
      static_cast<const void*>(bias));
  return status;
#else
  return 1;
#endif
}

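// Fused backward pass through GELU and the linear layer: d_input gets
// d_output @ weight with the GELU derivative applied using gelu_in, and d_bias
// gets the corresponding bias gradient. Only available on the cublasLt path;
// status stays 1 otherwise.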
template <typename T>
int bias_gelu_linear_dgrad_bgrad_cuda(T *weight, T *d_output, T *gelu_in, int in_features, int batch_size, int out_features, int heuristic, T *d_input, T *d_bias, void *lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from the cublas handle so the Lt call runs on the same stream.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  status = gemm_dgelu_bgradb_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_N,
      CUBLAS_OP_N,
      in_features,
      batch_size,
      out_features,
      &alpha, /* host pointer */
      weight,
      in_features,
      d_output,
      out_features,
      &beta_zero, /* host pointer */
      d_input,
      in_features,
      lt_workspace,
      1 << 22,
      stream,
      heuristic,
      static_cast<const void*>(gelu_in),
      static_cast<const void*>(d_bias));
#endif
  return status;
}

template int linear_bias_wgrad_cuda<at::Half>(at::Half *input, at::Half *d_output, int in_features, int batch_size, int out_features, at::Half *d_weight, at::Half *d_bias, void *lt_workspace);
template int linear_bias_wgrad_cuda<at::BFloat16>(at::BFloat16 *input, at::BFloat16 *d_output, int in_features, int batch_size, int out_features, at::BFloat16 *d_weight, at::BFloat16 *d_bias, void *lt_workspace);

template int linear_gelu_forward_cuda<at::Half>(at::Half *input, at::Half *weight, at::Half *bias, int in_features, int batch_size, int out_features, int heuristic, at::Half *output, at::Half *gelu_in, void *lt_workspace);
template int linear_gelu_forward_cuda<at::BFloat16>(at::BFloat16 *input, at::BFloat16 *weight, at::BFloat16 *bias, int in_features, int batch_size, int out_features, int heuristic, at::BFloat16 *output, at::BFloat16 *gelu_in, void *lt_workspace);

template int bias_gelu_linear_dgrad_bgrad_cuda<at::Half>(at::Half *weight, at::Half *d_output, at::Half *gelu_in, int in_features, int batch_size, int out_features, int heuristic, at::Half *d_input, at::Half *d_bias, void *lt_workspace);
template int bias_gelu_linear_dgrad_bgrad_cuda<at::BFloat16>(at::BFloat16 *weight, at::BFloat16 *d_output, at::BFloat16 *gelu_in, int in_features, int batch_size, int out_features, int heuristic, at::BFloat16 *d_input, at::BFloat16 *d_bias, void *lt_workspace);
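
// Illustrative (not compiled) sketch of how these entry points might be driven
// from a PyTorch C++ extension. Tensor names, shapes, and the 4 MiB workspace
// are assumptions for the example, not part of this file's API:
//
//   at::Tensor input  = ...;  // [batch_size, in_features], fp16, contiguous, CUDA
//   at::Tensor weight = ...;  // [out_features, in_features]
//   at::Tensor bias   = ...;  // [out_features]
//   auto output  = at::empty({batch_size, out_features}, input.options());
//   auto gelu_in = at::empty({batch_size, out_features}, input.options());
//   auto lt_workspace = at::empty({1 << 22}, input.options().dtype(at::kByte));
//   int rc = linear_gelu_forward_cuda<at::Half>(
//       input.data_ptr<at::Half>(), weight.data_ptr<at::Half>(), bias.data_ptr<at::Half>(),
//       in_features, batch_size, out_features, /*heuristic=*/0,
//       output.data_ptr<at::Half>(), gelu_in.data_ptr<at::Half>(), lt_workspace.data_ptr());
//   TORCH_CHECK(rc == 0, "linear_gelu_forward_cuda failed");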