Merge branch 'Artem-B-clang-fixes'
This commit is contained in:
commit
901287175f
36
common.mk
36
common.mk
@ -41,51 +41,64 @@ endif
|
|||||||
|
|
||||||
ifeq (70, $(findstring 70, $(SM_ARCH)))
|
ifeq (70, $(findstring 70, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
|
SM_TARGETS += -gencode=arch=compute_70,code=\"sm_70,compute_70\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_70
|
||||||
endif
|
endif
|
||||||
ifeq (62, $(findstring 62, $(SM_ARCH)))
|
ifeq (62, $(findstring 62, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
|
SM_TARGETS += -gencode=arch=compute_62,code=\"sm_62,compute_62\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_62
|
||||||
endif
|
endif
|
||||||
ifeq (61, $(findstring 61, $(SM_ARCH)))
|
ifeq (61, $(findstring 61, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
|
SM_TARGETS += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_61
|
||||||
endif
|
endif
|
||||||
ifeq (60, $(findstring 60, $(SM_ARCH)))
|
ifeq (60, $(findstring 60, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
|
SM_TARGETS += -gencode=arch=compute_60,code=\"sm_60,compute_60\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_60
|
||||||
endif
|
endif
|
||||||
ifeq (52, $(findstring 52, $(SM_ARCH)))
|
ifeq (52, $(findstring 52, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
|
SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_52
|
||||||
endif
|
endif
|
||||||
ifeq (37, $(findstring 37, $(SM_ARCH)))
|
ifeq (37, $(findstring 37, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
|
SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_37
|
||||||
endif
|
endif
|
||||||
ifeq (35, $(findstring 35, $(SM_ARCH)))
|
ifeq (35, $(findstring 35, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
|
SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_35
|
||||||
endif
|
endif
|
||||||
ifeq (30, $(findstring 30, $(SM_ARCH)))
|
ifeq (30, $(findstring 30, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
|
SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_30
|
||||||
endif
|
endif
|
||||||
ifeq (21, $(findstring 21, $(SM_ARCH)))
|
ifeq (21, $(findstring 21, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\"
|
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_21,compute_20\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_21
|
||||||
endif
|
endif
|
||||||
ifeq (20, $(findstring 20, $(SM_ARCH)))
|
ifeq (20, $(findstring 20, $(SM_ARCH)))
|
||||||
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\"
|
SM_TARGETS += -gencode=arch=compute_20,code=\"sm_20,compute_20\"
|
||||||
|
CLANG_SM_TARGETS += --cuda-gpu-arch=sm_20
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
# [verbose=<0|1>] Verbose toolchain output from nvcc option
|
# [verbose=<0|1>] Verbose toolchain output from nvcc option
|
||||||
ifeq ($(verbose), 1)
|
ifeq ($(verbose), 1)
|
||||||
NVCCFLAGS += -v
|
NVCCFLAGS += -v
|
||||||
|
CLANG_CFLAGS += -v
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
# [keep=<0|1>] Keep intermediate compilation artifacts option
|
# [keep=<0|1>] Keep intermediate compilation artifacts option
|
||||||
ifeq ($(keep), 1)
|
ifeq ($(keep), 1)
|
||||||
NVCCFLAGS += -keep
|
NVCCFLAGS += -keep
|
||||||
|
CLANG_CFLAGS += --save-temps
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
# [debug=<0|1>] Generate debug mode code
|
# [debug=<0|1>] Generate debug mode code
|
||||||
ifeq ($(debug), 1)
|
ifeq ($(debug), 1)
|
||||||
NVCCFLAGS += -G
|
NVCCFLAGS += -G
|
||||||
|
CLANG_CFLAGS += --cuda-noopt-device-debug
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
@ -107,7 +120,7 @@ OSUPPER := $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
|
|||||||
|
|
||||||
# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
|
# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
|
||||||
NVCCFLAGS += -O3 -Xptxas -v
|
NVCCFLAGS += -O3 -Xptxas -v
|
||||||
|
CLANG_CFLAGS += -O3 -Xcuda-ptxas -v
|
||||||
ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
|
ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
|
||||||
# For MSVC
|
# For MSVC
|
||||||
|
|
||||||
@ -139,9 +152,26 @@ else
|
|||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# compiler=clang Enables compilation with clang.
|
||||||
|
|
||||||
|
ifeq ($(compiler), clang)
|
||||||
|
# NVCC_VERSION is used as the proxy for the CUDA version.
|
||||||
|
BIN_SUFFIX := sm$(SM_ARCH)_clang_cuda_$(NVCC_VERSION)
|
||||||
|
# Clangs needs few extra flags to point it to CUDA SDK
|
||||||
|
# and link the binaries with CUDA runtime.
|
||||||
|
CUDA_BASE=$(realpath $(join $(dir $(shell which nvcc)), ..))
|
||||||
|
CLANG_CFLAGS += --cuda-path=$(CUDA_BASE)
|
||||||
|
LIBINC += -L$(CUDA_BASE)/lib64 -Wl,-rpath=$(CUDA_BASE)/lib64
|
||||||
|
LIBS += -lcudart
|
||||||
|
|
||||||
|
# Replace NVCC and its options with clang++.
|
||||||
|
NVCC = clang++
|
||||||
|
NVCCFLAGS = $(CLANG_CFLAGS)
|
||||||
|
SM_TARGETS = $(CLANG_SM_TARGETS)
|
||||||
|
else
|
||||||
# Suffix to append to each binary
|
# Suffix to append to each binary
|
||||||
BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION)
|
BIN_SUFFIX := sm$(SM_ARCH)_nvcc_$(NVCC_VERSION)
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
@ -149,5 +179,3 @@ BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION)
|
|||||||
#-------------------------------------------------------------------------------
|
#-------------------------------------------------------------------------------
|
||||||
|
|
||||||
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
|
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
|
||||||
|
|
||||||
|
|
||||||
|
@ -484,7 +484,8 @@ struct block_task
|
|||||||
* - Applies the scalar multipliers and addends to the accumulators
|
* - Applies the scalar multipliers and addends to the accumulators
|
||||||
* - Write the result to the output matrix
|
* - Write the result to the output matrix
|
||||||
*/
|
*/
|
||||||
inline __device__ void epilogue()
|
__forceinline__ __device__
|
||||||
|
void epilogue()
|
||||||
{
|
{
|
||||||
// Wait for predecessor thread block(s) to produce block-wide tile of
|
// Wait for predecessor thread block(s) to produce block-wide tile of
|
||||||
// exclsuive partial-sums
|
// exclsuive partial-sums
|
||||||
@ -551,7 +552,7 @@ struct block_task
|
|||||||
* Consume a tile of A and B each
|
* Consume a tile of A and B each
|
||||||
*/
|
*/
|
||||||
template <bool DoGlobalPrefetch>
|
template <bool DoGlobalPrefetch>
|
||||||
inline __device__
|
__forceinline__ __device__
|
||||||
void consume_tile()
|
void consume_tile()
|
||||||
{
|
{
|
||||||
// Unroll BlockDpVectorsK iterations of outer-product accumulations
|
// Unroll BlockDpVectorsK iterations of outer-product accumulations
|
||||||
@ -612,7 +613,7 @@ struct block_task
|
|||||||
/**
|
/**
|
||||||
* Compute GEMM
|
* Compute GEMM
|
||||||
*/
|
*/
|
||||||
inline __device__
|
__forceinline__ __device__
|
||||||
void run()
|
void run()
|
||||||
{
|
{
|
||||||
// Quit if the thread block is fully out-of-bounds
|
// Quit if the thread block is fully out-of-bounds
|
||||||
|
@ -47,7 +47,17 @@ namespace cutlass {
|
|||||||
#if !defined(__CUDA_ARCH__)
|
#if !defined(__CUDA_ARCH__)
|
||||||
#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
|
#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
|
||||||
#else
|
#else
|
||||||
#define CUDA_LOG(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__);
|
inline __host__ __device__ unsigned get_threadidx_x() { return threadIdx.x; }
|
||||||
|
inline __host__ __device__ unsigned get_threadidx_y() { return threadIdx.y; }
|
||||||
|
inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; }
|
||||||
|
inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; }
|
||||||
|
inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; }
|
||||||
|
inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; }
|
||||||
|
#define CUDA_LOG(format, ...) \
|
||||||
|
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
|
||||||
|
get_blockidx_x(), get_blockidx_y(), get_blockidx_z(), \
|
||||||
|
get_threadidx_x(), get_threadidx_y(), get_threadidx_z(), \
|
||||||
|
__VA_ARGS__);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user