diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bf6d239..9468cc1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # NVIDIA CUTLASS Changelog +## [1.3.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.1) (2019-04-09) + * Corrected NVRTC unit tests. + ## [1.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.3.0) (2019-03-20) * Efficient GEMM kernel targeting Volta Tensor Cores via `mma.sync` instruction added in CUDA 10.1. diff --git a/README.md b/README.md index 231eafba..4c539e92 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # CUTLASS 1.3 -_CUTLASS 1.3.0 - March 2019_ +_CUTLASS 1.3.1 - April 2019_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -28,6 +28,10 @@ CUTLASS 1.3 is described in the [CUTLASS Documentation](CUTLASS.md) and the acco We describe the structure of an efficient GEMM in our talk at the [GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf). +# What's New in CUTLASS 1.3.1 +_April 2019_ +* CUTLASS 1.3.1 corrected NVRTC unit tests. + # What's New in CUTLASS 1.3 _March 2019_ * CUTLASS 1.3 includes an efficient GEMM implementation with the `mma.sync` instruction added in CUDA 10.1. diff --git a/cutlass/cutlass.h b/cutlass/cutlass.h index 26de6c02..783ea3b6 100644 --- a/cutlass/cutlass.h +++ b/cutlass/cutlass.h @@ -34,7 +34,7 @@ #define CUTLASS_MAJOR 1 #define CUTLASS_MINOR 3 -#define CUTLASS_PATCH 0 +#define CUTLASS_PATCH 1 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH) #ifdef __NVCC__ @@ -58,8 +58,13 @@ // CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. 
#if defined(__CUDA_ARCH__) +#ifdef __NVCC__ #define CUTLASS_PRAGMA_UNROLL #pragma unroll #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1 +#elif defined(__CUDACC_RTC__) + #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll") + #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1") +#endif #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL @@ -80,6 +85,7 @@ template struct DebugType {}; template +CUTLASS_HOST_DEVICE void DebugTypeFunc(T const& t) { T::t; } diff --git a/cutlass/gemm/gemm.h b/cutlass/gemm/gemm.h index 0d919199..70cb2ccc 100644 --- a/cutlass/gemm/gemm.h +++ b/cutlass/gemm/gemm.h @@ -33,7 +33,6 @@ #include "cutlass/coord.h" #include "cutlass/util/platform.h" -#include namespace cutlass { namespace gemm { @@ -84,6 +83,7 @@ void gemm_kernel_nolb(typename Gemm_::Params params) { //////////////////////////////////////////////////////////////////////////////////////////////////// +#if !defined(__CUDACC_RTC__) /// Partial specialization for launching the GEMM kernel with or without launch bounds template struct Launch { @@ -152,7 +152,51 @@ struct Launch { smem_size, stream >>>(params); } + + // Use device API to launch kernel + Launch(cudaError_t &result, CUfunction kernel, + typename Gemm::Params params, dim3 grid, dim3 block, CUstream stream = CU_STREAM_LEGACY) { + void* params_[] = {const_cast(reinterpret_cast(¶ms))}; + + int smem_size = int(sizeof(typename Gemm::SharedStorage)); + if (smem_size >= (48 << 10)) { + + result = cudaFuncSetAttribute( + kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size + ); + + if (result != cudaSuccess) { + return; + } + + result = cudaFuncSetAttribute( + kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, + 100); + + if (result != cudaSuccess) { + return; + } + } + + CUresult launch_result = cuLaunchKernel( + kernel, + grid.x, grid.y, grid.z, + block.x, block.y, block.z, + smem_size, stream, params_, 0); + + if (launch_result != CUDA_SUCCESS) { + result = cudaErrorLaunchFailure; + return; + } + + result = 
cudaSuccess; + return; + } }; +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -188,20 +232,13 @@ struct Gemm { static __host__ cudaError_t launch(CUfunction kernel, Params const& params, CUstream stream = CU_STREAM_LEGACY) { + cudaError_t result; // Launch the kernel. - void* params_[] = {const_cast(reinterpret_cast(¶ms))}; + Launch( + result, kernel, params, params.grid, params.block, stream); - CUresult result = cuLaunchKernel( - kernel, - params.grid.x, params.grid.y, params.grid.z, - params.block.x, params.block.y, params.block.z, - 0, stream, params_, 0); - - if (result != CUDA_SUCCESS) { - return cudaErrorLaunchFailure; - } - return cudaSuccess; + return result; } #endif diff --git a/cutlass/gemm/scalar_or_pointer.h b/cutlass/gemm/scalar_or_pointer.h index 9e292951..315d1a4c 100644 --- a/cutlass/gemm/scalar_or_pointer.h +++ b/cutlass/gemm/scalar_or_pointer.h @@ -1,3 +1,4 @@ + /*************************************************************************************************** * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
* diff --git a/cutlass/layout/thread/transform.h b/cutlass/layout/thread/transform.h index 3abb22e1..8b9445e0 100644 --- a/cutlass/layout/thread/transform.h +++ b/cutlass/layout/thread/transform.h @@ -77,6 +77,7 @@ struct Copy { } }; +#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) template struct Copy { CUTLASS_DEVICE void copy(cutlass::TensorView dst, @@ -140,6 +141,7 @@ struct Copy struct Transform { typedef Fragment::kCount> DstFragment; @@ -266,6 +269,7 @@ struct Transform { Transformer.copy(dstView, srcView); } }; +#endif template struct Transform { diff --git a/cutlass/tensor_view.h b/cutlass/tensor_view.h index d770a193..28b12e5a 100644 --- a/cutlass/tensor_view.h +++ b/cutlass/tensor_view.h @@ -36,8 +36,6 @@ #pragma once -#include - #include "cutlass/cutlass.h" #include "cutlass/tensor_ref.h" diff --git a/cutlass/tile_iterator.h b/cutlass/tile_iterator.h index 923d7e10..b8a88252 100644 --- a/cutlass/tile_iterator.h +++ b/cutlass/tile_iterator.h @@ -34,7 +34,6 @@ #include "cutlass/load_store.h" #include "cutlass/predicate_vector.h" #include "cutlass/vector.h" -#include namespace cutlass { diff --git a/cutlass/util/performance_tuning.h b/cutlass/util/performance_tuning.h deleted file mode 100644 index fd117740..00000000 --- a/cutlass/util/performance_tuning.h +++ /dev/null @@ -1,40 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are not permitted. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#pragma once -#ifndef CUTLASS_PERFORMANCE_TUNING_H -#define CUTLASS_PERFORMANCE_TUNING_H - -// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. - -#if defined(__CUDA_ARCH__) -#if defined(_MSC_VER) -#define CUTLASS_PRAGMA_UNROLL __pragma("unroll") -#define CUTLASS_PRAGMA_NO_UNROLL __pragma("unroll 1") -#else -#define CUTLASS_PRAGMA_UNROLL _Pragma("unroll") -#define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1") -#endif -#else -#define CUTLASS_PRAGMA_UNROLL -#define CUTLASS_PRAGMA_NO_UNROLL -#endif - -#define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL -#endif // CUTLASS_PERFORMANCE_TUNING_H diff --git a/cutlass/vector.h b/cutlass/vector.h index 9b8a30ea..e59340b8 100644 --- a/cutlass/vector.h +++ b/cutlass/vector.h @@ -88,6 +88,8 @@ union Vector { //////////////////////////////////////////////////////////////////////////////////////////////////// +#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) + template <> union Vector { /// The scalar type. 
@@ -118,7 +120,6 @@ union Vector { } }; -#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) template union Vector { diff --git a/tools/nvrtc/CMakeLists.txt b/tools/nvrtc/CMakeLists.txt index 2eeb90d0..bb8eeee9 100644 --- a/tools/nvrtc/CMakeLists.txt +++ b/tools/nvrtc/CMakeLists.txt @@ -54,7 +54,7 @@ if (CUTLASS_NVRTC_ENABLE) string(APPEND NVRTC_INCLUDES_STRINGS "char const *kCutlassHeaders[] = {\n") string(APPEND NVRTC_INCLUDES_NAMES "char const *kCutlassHeaderNames[] = {\n") - add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE}") + add_nvrtc_headers(${CMAKE_SOURCE_DIR} "${CUTLASS_CORE};${CUTLASS_GEMM};${CUTLASS_UTIL};${CUTLASS_DEVICE};${CUTLASS_ARCH};${CUTLASS_LAYOUT_THREAD}") message("${CMAKE_CURRENT_SOURCE_DIR}/") add_nvrtc_headers("${CMAKE_CURRENT_SOURCE_DIR}/stdlib" "assert.h;stdint.h") if(CUTLASS_NVRTC_HAS_CUDA_FP16) diff --git a/tools/test/unit/gemm/gemm_nvrtc.cu b/tools/test/unit/gemm/gemm_nvrtc.cu index 89dfe1a6..dcfb1658 100644 --- a/tools/test/unit/gemm/gemm_nvrtc.cu +++ b/tools/test/unit/gemm/gemm_nvrtc.cu @@ -43,6 +43,8 @@ TEST(Dgemm_nvrtc_64x32x8, dgemm_nvrtc_64x32x8_nt) { //////////////////////////////////////////////////////////////////////////////////////////////////// +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 610)) + TEST(Igemm__nvrtc_128x128x32, igemm_nvrtc_256x256x64_tt) { typedef cutlass::gemm::IgemmTraits(gemm_traits, 256, 256, 64); } +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// TEST(Sgemm_nvrtc_128x128x8, sgemm_nvrtc_128x112x16_alpha2_beta1_nt) { diff --git a/tools/test/unit/gemm/gemm_nvrtc.h b/tools/test/unit/gemm/gemm_nvrtc.h index 493eafb4..10123d68 100644 --- a/tools/test/unit/gemm/gemm_nvrtc.h +++ b/tools/test/unit/gemm/gemm_nvrtc.h @@ -30,6 +30,7 @@ #include #include "tools/nvrtc/cutlass/nvrtc/environment.h" #include +#include static inline bool check_nvrtc_error(nvrtcResult error) { if (error != 
NVRTC_SUCCESS) { @@ -67,31 +68,36 @@ static __host__ void run_gemm_nvrtc( testbed(m, n, k, - cutlass::convert(GemmTraits_::kLayoutA), - cutlass::convert(GemmTraits_::kLayoutB), + test::convert(GemmTraits_::kLayoutA), + test::convert(GemmTraits_::kLayoutB), alpha, beta); + int currentDevice; + cudaGetDevice(&currentDevice); + + // generate the architecture string for the nvrtc compiler + cudaDeviceProp deviceProperties; + cudaGetDeviceProperties(&deviceProperties, currentDevice); + std::stringstream arch; + arch << "-arch=compute_" << deviceProperties.major << deviceProperties.minor; + // Instantiate gemm_kernel nvrtcResult result_nvrtc; nvrtcProgram program; static char const *src = - "#include "cutlass/gemm/gemm.h"\n" - "#include "cutlass/gemm/sgemm_traits.h"\n" - "#include "cutlass/gemm/dgemm_traits.h"\n" - "#include "cutlass/gemm/igemm_traits.h"\n" + "#include \"cutlass/gemm/gemm.h\"\n" + "#include \"cutlass/gemm/sgemm_traits.h\"\n" + "#include \"cutlass/gemm/dgemm_traits.h\"\n" + "#include \"cutlass/gemm/igemm_traits.h\"\n" #if defined(CUTLASS_NVRTC_HAS_FP16) - "#include "cutlass/gemm/hgemm_traits.h"\n" - "#include "cutlass/gemm/wmma_gemm_traits.h"\n" + "#include \"cutlass/gemm/hgemm_traits.h\"\n" + "#include \"cutlass/gemm/wmma_gemm_traits.h\"\n" #endif ; std::string type_name; -#if 0 - nvrtcGetTypeName(&type_name); -#else - type_name = gemm_traits; -#endif + nvrtcGetTypeName(&type_name); result_nvrtc = nvrtcCreateProgram(&program, src, @@ -102,10 +108,22 @@ check_nvrtc_error(result_nvrtc); std::string gemm_kernel_instantiation = - "cutlass::gemm::gemm_kernel >"; + "cutlass::gemm::gemm_kernel::KernelClass >"; nvrtcAddNameExpression(program, gemm_kernel_instantiation.c_str()); - result_nvrtc = nvrtcCompileProgram(program, 0, NULL); + // generate option list to generate kernel for the underlying GPU + std::vector options; + std::vector c_options; + + options.push_back(arch.str()); + + // convert option list into a c-string list 
for the nvrtc interface + for (std::vector::const_iterator i = options.begin(); i != options.end(); ++i) { + c_options.push_back(i->c_str()); + } + + // compile + result_nvrtc = nvrtcCompileProgram(program, int(c_options.size()), c_options.data()); if (result_nvrtc != NVRTC_SUCCESS) { size_t logSize; nvrtcGetProgramLogSize(program, &logSize); @@ -118,11 +136,13 @@ } // The lowered name is the name of the template instantiation in the generated PTX code. - char const *gemm_kernel_lowered_name; - nvrtcGetLoweredName(program, gemm_kernel_instantiation.c_str(), &gemm_kernel_lowered_name); + char const *temp_gemm_kernel_lowered_name; + nvrtcGetLoweredName(program, gemm_kernel_instantiation.c_str(), &temp_gemm_kernel_lowered_name); if (!check_nvrtc_error(result_nvrtc)) { ASSERT_TRUE(false); } + // the pointer we got from nvrtcGetLoweredName is valid only as long as the program is valid. create a copy. + std::string gemm_kernel_lowered_name(temp_gemm_kernel_lowered_name); // Query the size of the generated PTX so that we can allocate storage and retrieve it afterwards size_t ptx_size; @@ -134,22 +154,32 @@ std::vector ptx(ptx_size); result_nvrtc = nvrtcGetPTX(program, ptx.data()); if (!check_nvrtc_error(result_nvrtc)) { + std::cerr << "failed to get ptx" << std::endl; ASSERT_TRUE(false); } // we do not need the nvrtc program anymore nvrtcDestroyProgram(&program); + // Now load the module CUmodule module; CUresult result_cuda; + result_cuda = cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0); if (result_cuda != CUDA_SUCCESS) { + const char *msg; + cuGetErrorName(result_cuda, &msg); + std::cerr << "\ncuModuleLoadDataEx error: failed with error " << msg << std::endl; ASSERT_TRUE(false); } + // and retrieve the function CUfunction kernel; - result_cuda = cuModuleGetFunction(&kernel, module, gemm_kernel_lowered_name.c_str()); 
if (result_cuda != CUDA_SUCCESS) { + const char *msg; + cuGetErrorName(result_cuda, &msg); + std::cerr << "\ncuModuleGetFunction error: failed with error " << msg << std::endl; ASSERT_TRUE(false); } @@ -173,16 +203,23 @@ static __host__ void run_gemm_nvrtc( testbed.ptr_computed(), testbed.ldc()); - // Gemm::launch(params); Gemm::launch(kernel, params); cudaError_t result = cudaDeviceSynchronize(); ASSERT_EQ(result, cudaSuccess) << "\nCUDA kernel launch error: " << cudaGetErrorString(result) - << "\n"; + << std::endl; if (testbed.has_cublas_support()) { ASSERT_TRUE(testbed.verify_with_cublas()); } else { ASSERT_TRUE(testbed.verify_with_host()); } + + result_cuda = cuModuleUnload(module); + if (result_cuda != CUDA_SUCCESS) { + const char *msg; + cuGetErrorName(result_cuda, &msg); + std::cerr << "\ncuModuleUnload error: failed with error " << msg << std::endl; + ASSERT_TRUE(false); + } }