cutlass 1.3.1 (#46)
CUTLASS 1.3.1 patch resolves failing tests with NVRTC.
This commit is contained in:
parent
877bdcace6
commit
fe3438a3c1
@ -1,5 +1,8 @@
|
|||||||
# NVIDIA CUTLASS Changelog
|
# NVIDIA CUTLASS Changelog
|
||||||
|
|
||||||
|
## [1.3.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.1) (2019-04-09)
|
||||||
|
* Corrected NVRTC unit tests.
|
||||||
|
|
||||||
## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20)
|
## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20)
|
||||||
* Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1.
|
* Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1.
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
# CUTLASS 1.3
|
# CUTLASS 1.3
|
||||||
|
|
||||||
_CUTLASS 1.3.0 - March 2019_
|
_CUTLASS 1.3.1 - April 2019_
|
||||||
|
|
||||||
CUTLASS is a collection of CUDA C++ template abstractions for implementing
|
CUTLASS is a collection of CUDA C++ template abstractions for implementing
|
||||||
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
|
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
|
||||||
@ -28,6 +28,10 @@ CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the acco
|
|||||||
We describe the structure of an efficient GEMM in our talk at the
|
We describe the structure of an efficient GEMM in our talk at the
|
||||||
[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
|
[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
|
||||||
|
|
||||||
|
# What's New in CUTLASS 1.3.1
|
||||||
|
_April 2019_
|
||||||
|
* CUTLASS 1.3.1 corrected NVRTC unit tests.
|
||||||
|
|
||||||
# What's New in CUTLASS 1.3
|
# What's New in CUTLASS 1.3
|
||||||
_March 2019_
|
_March 2019_
|
||||||
* CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1.
|
* CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1.
|
||||||
|
@ -34,7 +34,7 @@
|
|||||||
|
|
||||||
#define CUTLASS_MAJOR 1
|
#define CUTLASS_MAJOR 1
|
||||||
#define CUTLASS_MINOR 3
|
#define CUTLASS_MINOR 3
|
||||||
#define CUTLASS_PATCH 0
|
#define CUTLASS_PATCH 1
|
||||||
#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
|
#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
|
||||||
|
|
||||||
#ifdef __NVCC__
|
#ifdef __NVCC__
|
||||||
@ -58,8 +58,13 @@
|
|||||||
|
|
||||||
// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
|
// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
|
||||||
#if defined(__CUDA_ARCH__)
|
#if defined(__CUDA_ARCH__)
|
||||||
|
#ifdef __NVCC__
|
||||||
#define CUTLASS_PRAGMA_UNROLL #pragma unroll
|
#define CUTLASS_PRAGMA_UNROLL #pragma unroll
|
||||||
#define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1
|
#define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1
|
||||||
|
#elif defined(__CUDACC_RTC__)
|
||||||
|
#define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
|
||||||
|
#define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
|
||||||
|
#endif
|
||||||
|
|
||||||
#define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL
|
#define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL
|
||||||
|
|
||||||
@ -80,6 +85,7 @@ template <typename T>
|
|||||||
struct DebugType {};
|
struct DebugType {};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
CUTLASS_HOST_DEVICE
|
||||||
void DebugTypeFunc(T const& t) {
|
void DebugTypeFunc(T const& t) {
|
||||||
T::t;
|
T::t;
|
||||||
}
|
}
|
||||||
|
@ -33,7 +33,6 @@
|
|||||||
|
|
||||||
#include "cutlass/coord.h"
|
#include "cutlass/coord.h"
|
||||||
#include "cutlass/util/platform.h"
|
#include "cutlass/util/platform.h"
|
||||||
#include <cstdio>
|
|
||||||
namespace cutlass {
|
namespace cutlass {
|
||||||
namespace gemm {
|
namespace gemm {
|
||||||
|
|
||||||
@ -84,6 +83,7 @@ void gemm_kernel_nolb(typename Gemm_::Params params) {
|
|||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#if !defined(__CUDACC_RTC__)
|
||||||
/// Partial specialization for launching the GEMM kernel with or without launch bounds
|
/// Partial specialization for launching the GEMM kernel with or without launch bounds
|
||||||
template <typename Gemm, bool WithLaunchBounds>
|
template <typename Gemm, bool WithLaunchBounds>
|
||||||
struct Launch {
|
struct Launch {
|
||||||
@ -152,7 +152,51 @@ struct Launch<Gemm, false> {
|
|||||||
smem_size,
|
smem_size,
|
||||||
stream >>>(params);
|
stream >>>(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Use device API to launch kernel
|
||||||
|
Launch(cudaError_t &result, CUfunction kernel,
|
||||||
|
typename Gemm::Params params, dim3 grid, dim3 block, CUstream stream = CU_STREAM_LEGACY) {
|
||||||
|
void* params_[] = {const_cast<void*>(reinterpret_cast<void const*>(&params))};
|
||||||
|
|
||||||
|
int smem_size = int(sizeof(typename Gemm::SharedStorage));
|
||||||
|
if (smem_size >= (48 << 10)) {
|
||||||
|
|
||||||
|
result = cudaFuncSetAttribute(
|
||||||
|
kernel,
|
||||||
|
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||||
|
smem_size
|
||||||
|
);
|
||||||
|
|
||||||
|
if (result != cudaSuccess) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
result = cudaFuncSetAttribute(
|
||||||
|
kernel,
|
||||||
|
cudaFuncAttributePreferredSharedMemoryCarveout,
|
||||||
|
100);
|
||||||
|
|
||||||
|
if (result != cudaSuccess) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
CUresult launch_result = cuLaunchKernel(
|
||||||
|
kernel,
|
||||||
|
grid.x, grid.y, grid.z,
|
||||||
|
block.x, block.y, block.z,
|
||||||
|
smem_size, stream, params_, 0);
|
||||||
|
|
||||||
|
if (launch_result != CUDA_SUCCESS) {
|
||||||
|
result = cudaErrorLaunchFailure;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
result = cudaSuccess;
|
||||||
|
return;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@ -188,20 +232,13 @@ struct Gemm {
|
|||||||
static __host__ cudaError_t launch(CUfunction kernel,
|
static __host__ cudaError_t launch(CUfunction kernel,
|
||||||
Params const& params,
|
Params const& params,
|
||||||
CUstream stream = CU_STREAM_LEGACY) {
|
CUstream stream = CU_STREAM_LEGACY) {
|
||||||
|
cudaError_t result;
|
||||||
|
|
||||||
// Launch the kernel.
|
// Launch the kernel.
|
||||||
void* params_[] = {const_cast<void*>(reinterpret_cast<void const*>(&params))};
|
Launch<KernelClass, Traits::GemmConfig::kLaunchBounds>(
|
||||||
|
result, kernel, params, params.grid, params.block, stream);
|
||||||
|
|
||||||
CUresult result = cuLaunchKernel(
|
return result;
|
||||||
kernel,
|
|
||||||
params.grid.x, params.grid.y, params.grid.z,
|
|
||||||
params.block.x, params.block.y, params.block.z,
|
|
||||||
0, stream, params_, 0);
|
|
||||||
|
|
||||||
if (result != CUDA_SUCCESS) {
|
|
||||||
return cudaErrorLaunchFailure;
|
|
||||||
}
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
/***************************************************************************************************
|
/***************************************************************************************************
|
||||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
*
|
*
|
||||||
|
@ -77,6 +77,7 @@ struct Copy {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
|
||||||
template <int rank>
|
template <int rank>
|
||||||
struct Copy<half, half, rank, cutlass::MatrixLayout::RowMajor, cutlass::MatrixLayout::RowMajor> {
|
struct Copy<half, half, rank, cutlass::MatrixLayout::RowMajor, cutlass::MatrixLayout::RowMajor> {
|
||||||
CUTLASS_DEVICE void copy(cutlass::TensorView<half, rank, cutlass::MatrixLayout::RowMajor> dst,
|
CUTLASS_DEVICE void copy(cutlass::TensorView<half, rank, cutlass::MatrixLayout::RowMajor> dst,
|
||||||
@ -140,6 +141,7 @@ struct Copy<half, half, 2, cutlass::MatrixLayout::RowMajor, cutlass::MatrixLayou
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
/// igemm swizzle
|
/// igemm swizzle
|
||||||
/// Transform a fragment.
|
/// Transform a fragment.
|
||||||
@ -239,6 +241,7 @@ struct Transform {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
|
||||||
template <typename Shape, int Rank, typename DstLayout, typename SrcLayout>
|
template <typename Shape, int Rank, typename DstLayout, typename SrcLayout>
|
||||||
struct Transform<Shape, Rank, half, DstLayout, half, SrcLayout> {
|
struct Transform<Shape, Rank, half, DstLayout, half, SrcLayout> {
|
||||||
typedef Fragment<half, ShapeCount<Shape>::kCount> DstFragment;
|
typedef Fragment<half, ShapeCount<Shape>::kCount> DstFragment;
|
||||||
@ -266,6 +269,7 @@ struct Transform<Shape, Rank, half, DstLayout, half, SrcLayout> {
|
|||||||
Transformer.copy(dstView, srcView);
|
Transformer.copy(dstView, srcView);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename Shape, int Rank, typename DstLayout, typename SrcLayout>
|
template <typename Shape, int Rank, typename DstLayout, typename SrcLayout>
|
||||||
struct Transform<Shape, Rank, int8_t, DstLayout, int8_t, SrcLayout> {
|
struct Transform<Shape, Rank, int8_t, DstLayout, int8_t, SrcLayout> {
|
||||||
|
@ -36,8 +36,6 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
#include "cutlass/cutlass.h"
|
#include "cutlass/cutlass.h"
|
||||||
#include "cutlass/tensor_ref.h"
|
#include "cutlass/tensor_ref.h"
|
||||||
|
|
||||||
|
@ -34,7 +34,6 @@
|
|||||||
#include "cutlass/load_store.h"
|
#include "cutlass/load_store.h"
|
||||||
#include "cutlass/predicate_vector.h"
|
#include "cutlass/predicate_vector.h"
|
||||||
#include "cutlass/vector.h"
|
#include "cutlass/vector.h"
|
||||||
#include <cstdio>
|
|
||||||
|
|
||||||
namespace cutlass {
|
namespace cutlass {
|
||||||
|
|
||||||
|
@ -1,40 +0,0 @@
|
|||||||
/******************************************************************************
|
|
||||||
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are not permitted.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
||||||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
||||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
||||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
||||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
||||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*
|
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#ifndef CUTLASS_PERFORMANCE_TUNING_H
|
|
||||||
#define CUTLASS_PERFORMANCE_TUNING_H
|
|
||||||
|
|
||||||
// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
|
|
||||||
|
|
||||||
#if defined(__CUDA_ARCH__)
|
|
||||||
#if defined(_MSC_VER)
|
|
||||||
#define CUTLASS_PRAGMA_UNROLL __pragma("unroll")
|
|
||||||
#define CUTLASS_PRAGMA_NO_UNROLL __pragma("unroll 1")
|
|
||||||
#else
|
|
||||||
#define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
|
|
||||||
#define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#define CUTLASS_PRAGMA_UNROLL
|
|
||||||
#define CUTLASS_PRAGMA_NO_UNROLL
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL
|
|
||||||
#endif // CUTLASS_PERFORMANCE_TUNING_H
|
|
@ -88,6 +88,8 @@ union Vector {
|
|||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
union Vector<half, 1> {
|
union Vector<half, 1> {
|
||||||
/// The scalar type.
|
/// The scalar type.
|
||||||
@ -118,7 +120,6 @@ union Vector<half, 1> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
|
|
||||||
|
|
||||||
template <int kLanes_>
|
template <int kLanes_>
|
||||||
union Vector<half, kLanes_> {
|
union Vector<half, kLanes_> {
|
||||||
|
@ -54,7 +54,7 @@ if (CUTLASS_NVRTC_ENABLE)
|
|||||||
string(APPEND NVRTC_INCLUDES_STRINGS "char const *kCutlassHeaders[] = {\n")
|
string(APPEND NVRTC_INCLUDES_STRINGS "char const *kCutlassHeaders[] = {\n")
|
||||||
string(APPEND NVRTC_INCLUDES_NAMES "char const *kCutlassHeaderNames[] = {\n")
|
string(APPEND NVRTC_INCLUDES_NAMES "char const *kCutlassHeaderNames[] = {\n")
|
||||||
|
|
||||||
add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE}")
|
add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE};${CUTLASS_ARCH};${CUTLASS_LAYOUT_THREAD}")
|
||||||
message("${CMAKE_CURRENT_SOURCE_DIR}/")
|
message("${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||||
add_nvrtc_headers("${CMAKE_CURRENT_SOURCE_DIR}/stdlib" "assert.h;stdint.h")
|
add_nvrtc_headers("${CMAKE_CURRENT_SOURCE_DIR}/stdlib" "assert.h;stdint.h")
|
||||||
if(CUTLASS_NVRTC_HAS_CUDA_FP16)
|
if(CUTLASS_NVRTC_HAS_CUDA_FP16)
|
||||||
|
@ -43,6 +43,8 @@ TEST(Dgemm_nvrtc_64x32x8, dgemm_nvrtc_64x32x8_nt) {
|
|||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 610))
|
||||||
|
|
||||||
TEST(Igemm__nvrtc_128x128x32, igemm_nvrtc_256x256x64_tt) {
|
TEST(Igemm__nvrtc_128x128x32, igemm_nvrtc_256x256x64_tt) {
|
||||||
typedef cutlass::gemm::IgemmTraits<cutlass::MatrixLayout::kRowMajor,
|
typedef cutlass::gemm::IgemmTraits<cutlass::MatrixLayout::kRowMajor,
|
||||||
cutlass::MatrixLayout::kRowMajor,
|
cutlass::MatrixLayout::kRowMajor,
|
||||||
@ -52,6 +54,8 @@ TEST(Igemm__nvrtc_128x128x32, igemm_nvrtc_256x256x64_tt) {
|
|||||||
run_gemm_nvrtc<IgemmTraits>(gemm_traits, 256, 256, 64);
|
run_gemm_nvrtc<IgemmTraits>(gemm_traits, 256, 256, 64);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
TEST(Sgemm_nvrtc_128x128x8, sgemm_nvrtc_128x112x16_alpha2_beta1_nt) {
|
TEST(Sgemm_nvrtc_128x128x8, sgemm_nvrtc_128x112x16_alpha2_beta1_nt) {
|
||||||
|
@ -30,6 +30,7 @@
|
|||||||
#include <nvrtc.h>
|
#include <nvrtc.h>
|
||||||
#include "tools/nvrtc/cutlass/nvrtc/environment.h"
|
#include "tools/nvrtc/cutlass/nvrtc/environment.h"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
static inline bool check_nvrtc_error(nvrtcResult error) {
|
static inline bool check_nvrtc_error(nvrtcResult error) {
|
||||||
if (error != NVRTC_SUCCESS) {
|
if (error != NVRTC_SUCCESS) {
|
||||||
@ -67,31 +68,36 @@ static __host__ void run_gemm_nvrtc(
|
|||||||
testbed(m,
|
testbed(m,
|
||||||
n,
|
n,
|
||||||
k,
|
k,
|
||||||
cutlass::convert(GemmTraits_::kLayoutA),
|
test::convert(GemmTraits_::kLayoutA),
|
||||||
cutlass::convert(GemmTraits_::kLayoutB),
|
test::convert(GemmTraits_::kLayoutB),
|
||||||
alpha,
|
alpha,
|
||||||
beta);
|
beta);
|
||||||
|
|
||||||
|
int currentDevice;
|
||||||
|
cudaGetDevice(&currentDevice);
|
||||||
|
|
||||||
|
// generate the architecture string for the nvrtc compiler
|
||||||
|
cudaDeviceProp deviceProperties;
|
||||||
|
cudaGetDeviceProperties(&deviceProperties, currentDevice);
|
||||||
|
std::stringstream arch;
|
||||||
|
arch << "-arch=compute_" << deviceProperties.major << deviceProperties.minor;
|
||||||
|
|
||||||
// Instantiate gemm_kernel
|
// Instantiate gemm_kernel
|
||||||
nvrtcResult result_nvrtc;
|
nvrtcResult result_nvrtc;
|
||||||
nvrtcProgram program;
|
nvrtcProgram program;
|
||||||
static char const *src =
|
static char const *src =
|
||||||
"#include "cutlass/gemm/gemm.h"\n"
|
"#include \"cutlass/gemm/gemm.h\"\n"
|
||||||
"#include "cutlass/gemm/sgemm_traits.h"\n"
|
"#include \"cutlass/gemm/sgemm_traits.h\"\n"
|
||||||
"#include "cutlass/gemm/dgemm_traits.h"\n"
|
"#include \"cutlass/gemm/dgemm_traits.h\"\n"
|
||||||
"#include "cutlass/gemm/igemm_traits.h"\n"
|
"#include \"cutlass/gemm/igemm_traits.h\"\n"
|
||||||
#if defined(CUTLASS_NVRTC_HAS_FP16)
|
#if defined(CUTLASS_NVRTC_HAS_FP16)
|
||||||
"#include "cutlass/gemm/hgemm_traits.h"\n"
|
"#include \"cutlass/gemm/hgemm_traits.h\"\n"
|
||||||
"#include "cutlass/gemm/wmma_gemm_traits.h"\n"
|
"#include \"cutlass/gemm/wmma_gemm_traits.h\"\n"
|
||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
|
|
||||||
std::string type_name;
|
std::string type_name;
|
||||||
#if 0
|
nvrtcGetTypeName<GemmTraits_>(&type_name);
|
||||||
nvrtcGetTypeName<typename GemmTraits_>(&type_name);
|
|
||||||
#else
|
|
||||||
type_name = gemm_traits;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
result_nvrtc = nvrtcCreateProgram(&program,
|
result_nvrtc = nvrtcCreateProgram(&program,
|
||||||
src,
|
src,
|
||||||
@ -102,10 +108,22 @@ static __host__ void run_gemm_nvrtc(
|
|||||||
check_nvrtc_error(result_nvrtc);
|
check_nvrtc_error(result_nvrtc);
|
||||||
|
|
||||||
std::string gemm_kernel_instantiation =
|
std::string gemm_kernel_instantiation =
|
||||||
"cutlass::gemm::gemm_kernel<cutlass::gemm::Gemm< " + type_name + " > >";
|
"cutlass::gemm::gemm_kernel<cutlass::gemm::Gemm< " + type_name + " >::KernelClass >";
|
||||||
nvrtcAddNameExpression(program, gemm_kernel_instantiation.c_str());
|
nvrtcAddNameExpression(program, gemm_kernel_instantiation.c_str());
|
||||||
|
|
||||||
result_nvrtc = nvrtcCompileProgram(program, 0, NULL);
|
// generate option list to generate kernel for the underlying GPU
|
||||||
|
std::vector<std::string> options;
|
||||||
|
std::vector<const char*> c_options;
|
||||||
|
|
||||||
|
options.push_back(arch.str());
|
||||||
|
|
||||||
|
// convert option list into a c-string list for the nvrtc interface
|
||||||
|
for (std::vector<std::string>::const_iterator i = options.begin(); i != options.end(); ++i) {
|
||||||
|
c_options.push_back(i->c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// compile
|
||||||
|
result_nvrtc = nvrtcCompileProgram(program, int(c_options.size()), c_options.data());
|
||||||
if (result_nvrtc != NVRTC_SUCCESS) {
|
if (result_nvrtc != NVRTC_SUCCESS) {
|
||||||
size_t logSize;
|
size_t logSize;
|
||||||
nvrtcGetProgramLogSize(program, &logSize);
|
nvrtcGetProgramLogSize(program, &logSize);
|
||||||
@ -118,11 +136,13 @@ static __host__ void run_gemm_nvrtc(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// The lowered name is the name of the template instantiation in the generated PTX code.
|
// The lowered name is the name of the template instantiation in the generated PTX code.
|
||||||
char const *gemm_kernel_lowered_name;
|
char const *temp_gemm_kernel_lowered_name;
|
||||||
nvrtcGetLoweredName(program, gemm_kernel_instantiation.c_str(), &gemm_kernel_lowered_name);
|
nvrtcGetLoweredName(program, gemm_kernel_instantiation.c_str(), &temp_gemm_kernel_lowered_name);
|
||||||
if (!check_nvrtc_error(result_nvrtc)) {
|
if (!check_nvrtc_error(result_nvrtc)) {
|
||||||
ASSERT_TRUE(false);
|
ASSERT_TRUE(false);
|
||||||
}
|
}
|
||||||
|
// the pointer we got from nvrtcGetLoweredName is valid only as long as the program is valid. Create a copy.
|
||||||
|
std::string gemm_kernel_lowered_name(temp_gemm_kernel_lowered_name);
|
||||||
|
|
||||||
// Query the size of the generated PTX so that we can allocate storage and retrieve it afterwards
|
// Query the size of the generated PTX so that we can allocate storage and retrieve it afterwards
|
||||||
size_t ptx_size;
|
size_t ptx_size;
|
||||||
@ -134,22 +154,32 @@ static __host__ void run_gemm_nvrtc(
|
|||||||
std::vector<char> ptx(ptx_size);
|
std::vector<char> ptx(ptx_size);
|
||||||
result_nvrtc = nvrtcGetPTX(program, ptx.data());
|
result_nvrtc = nvrtcGetPTX(program, ptx.data());
|
||||||
if (!check_nvrtc_error(result_nvrtc)) {
|
if (!check_nvrtc_error(result_nvrtc)) {
|
||||||
|
std::cerr << "failed to get ptx" << std::endl;
|
||||||
ASSERT_TRUE(false);
|
ASSERT_TRUE(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// we do not need the nvrtc program anymore
|
// we do not need the nvrtc program anymore
|
||||||
nvrtcDestroyProgram(&program);
|
nvrtcDestroyProgram(&program);
|
||||||
|
|
||||||
|
// Now load the module
|
||||||
CUmodule module;
|
CUmodule module;
|
||||||
CUresult result_cuda;
|
CUresult result_cuda;
|
||||||
|
|
||||||
result_cuda = cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0);
|
result_cuda = cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0);
|
||||||
if (result_cuda != CUDA_SUCCESS) {
|
if (result_cuda != CUDA_SUCCESS) {
|
||||||
|
const char *msg;
|
||||||
|
cuGetErrorName(result_cuda, &msg);
|
||||||
|
std::cerr << "\ncuModuleLoadDataEx error: failed with error " << msg << std::endl;
|
||||||
ASSERT_TRUE(false);
|
ASSERT_TRUE(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// and retrieve the function
|
||||||
CUfunction kernel;
|
CUfunction kernel;
|
||||||
result_cuda = cuModuleGetFunction(&kernel, module, gemm_kernel_lowered_name);
|
result_cuda = cuModuleGetFunction(&kernel, module, gemm_kernel_lowered_name.c_str());
|
||||||
if (result_cuda != CUDA_SUCCESS) {
|
if (result_cuda != CUDA_SUCCESS) {
|
||||||
|
const char *msg;
|
||||||
|
cuGetErrorName(result_cuda, &msg);
|
||||||
|
std::cerr << "\ncuModuleGetFunction error: failed with error " << msg << std::endl;
|
||||||
ASSERT_TRUE(false);
|
ASSERT_TRUE(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -173,16 +203,23 @@ static __host__ void run_gemm_nvrtc(
|
|||||||
testbed.ptr_computed(),
|
testbed.ptr_computed(),
|
||||||
testbed.ldc());
|
testbed.ldc());
|
||||||
|
|
||||||
// Gemm::launch(params);
|
|
||||||
Gemm::launch(kernel, params);
|
Gemm::launch(kernel, params);
|
||||||
|
|
||||||
cudaError_t result = cudaDeviceSynchronize();
|
cudaError_t result = cudaDeviceSynchronize();
|
||||||
ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
|
ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result)
|
||||||
<< "\n";
|
<< std::endl;
|
||||||
|
|
||||||
if (testbed.has_cublas_support()) {
|
if (testbed.has_cublas_support()) {
|
||||||
ASSERT_TRUE(testbed.verify_with_cublas());
|
ASSERT_TRUE(testbed.verify_with_cublas());
|
||||||
} else {
|
} else {
|
||||||
ASSERT_TRUE(testbed.verify_with_host());
|
ASSERT_TRUE(testbed.verify_with_host());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
result_cuda = cuModuleUnload(module);
|
||||||
|
if (result_cuda != CUDA_SUCCESS) {
|
||||||
|
const char *msg;
|
||||||
|
cuGetErrorName(result_cuda, &msg);
|
||||||
|
std::cerr << "\ncuModuleUnload error: failed with error " << msg << std::endl;
|
||||||
|
ASSERT_TRUE(false);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user