diff --git a/common.mk b/common.mk index bdf7559b..672ea5b2 100644 --- a/common.mk +++ b/common.mk @@ -41,51 +41,64 @@ endif ifeq (70, $(findstring 70, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_70 endif ifeq (62, $(findstring 62, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_62 endif ifeq (61, $(findstring 61, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_61 endif ifeq (60, $(findstring 60, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_60 endif ifeq (52, $(findstring 52, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_52 endif ifeq (37, $(findstring 37, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_37 endif ifeq (35, $(findstring 35, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_35 endif ifeq (30, $(findstring 30, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_30 endif ifeq (21, $(findstring 21, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_21 endif ifeq (20, $(findstring 20, $(SM_ARCH))) SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\" + CLANG_SM_TARGETS += --cuda-gpu-arch=sm_20 endif # [verbose=<0|1>] Verbose toolchain output from nvcc option ifeq ($(verbose), 1) NVCCFLAGS += -v + CLANG_CFLAGS += -v endif # [keep=<0|1>] Keep intermediate compilation artifacts option ifeq ($(keep), 1) NVCCFLAGS += -keep + CLANG_CFLAGS += --save-temps endif # [debug=<0|1>] Generate debug mode code ifeq ($(debug), 1) NVCCFLAGS += -G + CLANG_CFLAGS += --cuda-noopt-device-debug endif @@ -107,7 +120,7 @@ OSUPPER := $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) # Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases NVCCFLAGS += -O3 -Xptxas -v - +CLANG_CFLAGS += -O3 -Xcuda-ptxas -v ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER))) # For MSVC @@ -139,9 +152,26 @@ else endif +# compiler=clang Enables compilation with clang. -# Suffix to append to each binary -BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION) +ifeq ($(compiler), clang) + # NVCC_VERSION is used as the proxy for the CUDA version. + BIN_SUFFIX := sm$(SM_ARCH)_clang_cuda_$(NVCC_VERSION) + # Clangs needs few extra flags to point it to CUDA SDK + # and link the binaries with CUDA runtime. + CUDA_BASE=$(realpath $(join $(dir $(shell which nvcc)), ..)) + CLANG_CFLAGS += --cuda-path=$(CUDA_BASE) + LIBINC += -L$(CUDA_BASE)/lib64 -Wl,-rpath=$(CUDA_BASE)/lib64 + LIBS += -lcudart + + # Replace NVCC and its options with clang++. + NVCC = clang++ + NVCCFLAGS = $(CLANG_CFLAGS) + SM_TARGETS = $(CLANG_SM_TARGETS) +else + # Suffix to append to each binary + BIN_SUFFIX := sm$(SM_ARCH)_nvcc_$(NVCC_VERSION) +endif #------------------------------------------------------------------------------- @@ -149,5 +179,3 @@ BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION) #------------------------------------------------------------------------------- rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) - - diff --git a/cutlass/gemm/block_task.h b/cutlass/gemm/block_task.h index d5700c50..3940fb6e 100644 --- a/cutlass/gemm/block_task.h +++ b/cutlass/gemm/block_task.h @@ -484,7 +484,8 @@ struct block_task * - Applies the scalar multipliers and addends to the accumulators * - Write the result to the output matrix */ - inline __device__ void epilogue() + __forceinline__ __device__ + void epilogue() { // Wait for predecessor thread block(s) to produce block-wide tile of // exclsuive partial-sums @@ -551,7 +552,7 @@ struct block_task * Consume a tile of A and B each */ template - inline __device__ + __forceinline__ __device__ void consume_tile() { // Unroll BlockDpVectorsK iterations of outer-product accumulations @@ -612,7 +613,7 @@ struct block_task /** * Compute GEMM */ - inline __device__ + __forceinline__ __device__ void run() { // Quit if the thread block is fully out-of-bounds diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h index 81650932..2aedd17a 100644 --- a/cutlass/util/debug.h +++ b/cutlass/util/debug.h @@ -45,9 +45,19 @@ namespace cutlass { */ #if !defined(CUDA_LOG) #if !defined(__CUDA_ARCH__) - #define CUDA_LOG(format, ...) printf(format,__VA_ARGS__) + #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) #else - #define CUDA_LOG(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__); +inline __host__ __device__ unsigned get_threadidx_x() { return threadIdx.x; } +inline __host__ __device__ unsigned get_threadidx_y() { return threadIdx.y; } +inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; } +inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; } +inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; } +inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; } + #define CUDA_LOG(format, ...) \ + printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + get_blockidx_x(), get_blockidx_y(), get_blockidx_z(), \ + get_threadidx_x(), get_threadidx_y(), get_threadidx_z(), \ + __VA_ARGS__); #endif #endif