From e9e7cd4d44f5ce2f8be0e7d6f80b76346e9e5312 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Mon, 11 Dec 2017 14:16:52 -0800 Subject: [PATCH 1/6] Make cutlass compilable with clang. E.g: PATH=/nvcc/path/bin:/clang/path/bin:$PATH make sm=35,60 compiler=clang all --- common.mk | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/common.mk b/common.mk index bdf7559b..8ef8fe9d 100644 --- a/common.mk +++ b/common.mk @@ -41,51 +41,64 @@ endif ifeq (70, $(findstring 70, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_70 endif ifeq (62, $(findstring 62, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_62 endif ifeq (61, $(findstring 61, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_61 endif ifeq (60, $(findstring 60, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_60 endif ifeq (52, $(findstring 52, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_52 endif ifeq (37, $(findstring 37, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_37 endif ifeq (35, $(findstring 35, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_35 endif ifeq (30, $(findstring 30, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_30 endif ifeq (21, $(findstring 21, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_21 endif ifeq (20, $(findstring 20, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_20 endif # [verbose=<0|1>] Verbose toolchain output from nvcc option ifeq ($(verbose), 1) NVCCFLAGS += -v + CLANG_CFLAGS += -v endif # [keep=<0|1>] Keep intermediate compilation artifacts option ifeq ($(keep), 1) NVCCFLAGS += -keep + CLANG_CFLAGS += --save-temps endif # [debug=<0|1>] Generate debug mode code ifeq ($(debug), 1) NVCCFLAGS += -G + CLANG_CFLAGS += --cuda-noopt-device-debug endif @@ -107,7 +120,7 @@ OSUPPER := $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) # Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases NVCCFLAGS += -O3 -Xptxas -v - +CLANG_CFLAGS += -O3 -Xcuda-ptxas -v ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER))) # For MSVC @@ -139,9 +152,25 @@ else endif +# compiler=clang Enables compilation with clang. -# Suffix to append to each binary -BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION) +ifeq ($(compiler), clang) + BIN_SUFFIX := sm$(SM_ARCH)_clang_$(NVCC_VERSION) + # Clangs needs few extra flags to point it to CUDA SDK + # and link the binaries with CUDA runtime. + CUDA_BASE=$(realpath $(join $(dir $(shell which nvcc)), ..)) + CLANG_CFLAGS += --cuda-path=$(CUDA_BASE) + LIBINC += -L$(CUDA_BASE)/lib64 -Wl,-rpath=$(CUDA_BASE)/lib64 + LIBS += -lcudart + + # Replace NVCC and its options with clang++. + NVCC = clang++ + NVCCFLAGS = $(CLANG_CFLAGS) + SM_TARGETS = $(CLANG_SM_TARGETS) +else + # Suffix to append to each binary + BIN_SUFFIX := sm$(SM_ARCH)_nvcc_$(NVCC_VERSION) +endif #------------------------------------------------------------------------------- @@ -149,5 +178,3 @@ BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION) #------------------------------------------------------------------------------- rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) - - From ce2b3f695dc05e68348f2e1e14d9479c3578772c Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Fri, 8 Dec 2017 10:39:55 -0800 Subject: [PATCH 2/6] Fixed debug macros for clang. Unlike nvcc, clang always sees both host and device-side code during compilation. CUDA_LOG macro is used in both host and device code, so when it expanded to contain device-only code, that resulted in errors when it was used from the host-side functions. In order to make CUDA_LOG work with clang it was split into two parts -- a pair of target-attribute-based overloaded functions that perform host or device specific parts of logging, and a printf which works on both sides. --- cutlass/util/debug.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h index 81650932..3a4b2fd0 100644 --- a/cutlass/util/debug.h +++ b/cutlass/util/debug.h @@ -44,10 +44,26 @@ namespace cutlass { * Formats and prints the given message to stdout */ #if !defined(CUDA_LOG) - #if !defined(__CUDA_ARCH__) - #define CUDA_LOG(format, ...) printf(format,__VA_ARGS__) - #else - #define CUDA_LOG(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__); + #if defined(__clang__) && defined(__CUDA__) +static __device__ void cuda_log_location() { + printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ", blockIdx.x, blockIdx.y, + blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); +} +static __host__ void cuda_log_location() {} + #define CUDA_LOG(format, ...) \ + do { \ + cuda_log_location(); \ + printf(format, __VA_ARGS__); \ + } while (0) + #else // NVCC + #if !defined(__CUDA_ARCH__) + #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) + #else + #define CUDA_LOG(format, ...) \ + printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \ + threadIdx.y, threadIdx.z, __VA_ARGS__); + #endif #endif #endif From 81957b3a3ddd5b43683f3e16ef43b9d0a53f5d79 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Fri, 8 Dec 2017 16:59:32 -0800 Subject: [PATCH 3/6] Force inlining of few functions that rely on that for performance. Clang is less agressive than nvccnvcc, so number of functions did not getn inlined into the kernel by default. That prevented SROA from eliminating loads/stores to temporary buffers and resulted in abysmal performance. Replaced inline with __forceinline__ to ensure that we do inline the functions necessary for optimal performance. --- cutlass/gemm/block_task.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cutlass/gemm/block_task.h b/cutlass/gemm/block_task.h index d5700c50..3940fb6e 100644 --- a/cutlass/gemm/block_task.h +++ b/cutlass/gemm/block_task.h @@ -484,7 +484,8 @@ struct block_task * - Applies the scalar multipliers and addends to the accumulators * - Write the result to the output matrix */ - inline __device__ void epilogue() + __forceinline__ __device__ + void epilogue() { // Wait for predecessor thread block(s) to produce block-wide tile of // exclsuive partial-sums @@ -551,7 +552,7 @@ struct block_task * Consume a tile of A and B each */ template - inline __device__ + __forceinline__ __device__ void consume_tile() { // Unroll BlockDpVectorsK iterations of outer-product accumulations @@ -612,7 +613,7 @@ struct block_task /** * Compute GEMM */ - inline __device__ + __forceinline__ __device__ void run() { // Quit if the thread block is fully out-of-bounds From df4b4e4bb6a084127e61e6bdd1678eb19e26af94 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Mon, 11 Dec 2017 16:34:10 -0800 Subject: [PATCH 4/6] Added _cuda_ to the name of the executable to indicate that it's not clang's version. --- common.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common.mk b/common.mk index 8ef8fe9d..672ea5b2 100644 --- a/common.mk +++ b/common.mk @@ -155,7 +155,8 @@ endif # compiler=clang Enables compilation with clang. ifeq ($(compiler), clang) - BIN_SUFFIX := sm$(SM_ARCH)_clang_$(NVCC_VERSION) + # NVCC_VERSION is used as the proxy for the CUDA version. + BIN_SUFFIX := sm$(SM_ARCH)_clang_cuda_$(NVCC_VERSION) # Clangs needs few extra flags to point it to CUDA SDK # and link the binaries with CUDA runtime. CUDA_BASE=$(realpath $(join $(dir $(shell which nvcc)), ..)) From 39616514d0d0b7cadbaabffeb55f863774b282db Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 3 Jan 2018 16:36:50 -0800 Subject: [PATCH 5/6] Reworked CUDA_LOG macro to print location&the message with one printf. This replies on the fact that clang allows using device-side features from __host__/__device__ functions from __host__ ones as long as we don't have to generate code for that. Wrapping thread/blockIdx in __host__ __device__ function allows using CUDA_LOG everywhere during host and device compilation. --- cutlass/util/debug.h | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h index 3a4b2fd0..c10e447f 100644 --- a/cutlass/util/debug.h +++ b/cutlass/util/debug.h @@ -44,26 +44,20 @@ namespace cutlass { * Formats and prints the given message to stdout */ #if !defined(CUDA_LOG) - #if defined(__clang__) && defined(__CUDA__) -static __device__ void cuda_log_location() { - printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ", blockIdx.x, blockIdx.y, - blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); -} -static __host__ void cuda_log_location() {} - #define CUDA_LOG(format, ...) \ - do { \ - cuda_log_location(); \ - printf(format, __VA_ARGS__); \ - } while (0) - #else // NVCC - #if !defined(__CUDA_ARCH__) - #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) - #else - #define CUDA_LOG(format, ...) \ - printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ - blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \ - threadIdx.y, threadIdx.z, __VA_ARGS__); - #endif + #if !defined(__CUDA_ARCH__) + #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) + #else +inline __host__ __device__ unsigned get_threadidx_x() { return threadIdx.x; } +inline __host__ __device__ unsigned get_threadidx_y() { return threadIdx.y; } +inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; } +inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; } +inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; } +inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; } + #define CUDA_LOG(format, ...) \ + printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + get_blockidx_x(), get_blockidx_y(), get_blockidx_z(), \ + get_threadidx_x(), get_threadidx_y(), get_threadidx_z(), \ + __VA_ARGS__); #endif #endif From 1c9b54df16c064f0cb964d1e5469383e9930dadc Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 3 Jan 2018 16:42:51 -0800 Subject: [PATCH 6/6] Whitespace fix. --- cutlass/util/debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h index c10e447f..2aedd17a 100644 --- a/cutlass/util/debug.h +++ b/cutlass/util/debug.h @@ -53,7 +53,7 @@ inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; } inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; } inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; } inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; } - #define CUDA_LOG(format, ...) \ + #define CUDA_LOG(format, ...) \ printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ get_blockidx_x(), get_blockidx_y(), get_blockidx_z(), \ get_threadidx_x(), get_threadidx_y(), get_threadidx_z(), \