From e9e7cd4d44f5ce2f8be0e7d6f80b76346e9e5312 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Mon, 11 Dec 2017 14:16:52 -0800
Subject: [PATCH 1/6] Make cutlass compilable with clang.

E.g:
PATH=/nvcc/path/bin:/clang/path/bin:$PATH make sm=35,60 compiler=clang all
---
 common.mk | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/common.mk b/common.mk
index bdf7559b..8ef8fe9d 100644
--- a/common.mk
+++ b/common.mk
@@ -41,51 +41,64 @@ endif
 
 ifeq (70, $(findstring 70, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_70,code=\"sm_70,compute_70\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_70
 endif
 ifeq (62, $(findstring 62, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_62,code=\"sm_62,compute_62\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_62
 endif
 ifeq (61, $(findstring 61, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_61,code=\"sm_61,compute_61\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_61
 endif
 ifeq (60, $(findstring 60, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_60,code=\"sm_60,compute_60\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_60
 endif
 ifeq (52, $(findstring 52, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_52,code=\"sm_52,compute_52\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_52
 endif
 ifeq (37, $(findstring 37, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_37,code=\"sm_37,compute_37\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_37
 endif
 ifeq (35, $(findstring 35, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_35
 endif
 ifeq (30, $(findstring 30, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_30
 endif
 ifeq (21, $(findstring 21, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_21,compute_20\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_21
 endif
 ifeq (20, $(findstring 20, $(SM_ARCH)))
     SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+    CLANG_SM_TARGETS += --cuda-gpu-arch=sm_20
 endif
 
 
 # [verbose=<0|1>] Verbose toolchain output from nvcc option
 ifeq ($(verbose), 1)
 	NVCCFLAGS += -v
+	CLANG_CFLAGS += -v
 endif
 
 
 # [keep=<0|1>] Keep intermediate compilation artifacts option
 ifeq ($(keep), 1)
 	NVCCFLAGS += -keep
+	CLANG_CFLAGS += --save-temps
 endif
 
 
 # [debug=<0|1>] Generate debug mode code
 ifeq ($(debug), 1)
 	NVCCFLAGS += -G
+	CLANG_CFLAGS += --cuda-noopt-device-debug
 endif
 
 
@@ -107,7 +120,7 @@ OSUPPER := $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
 
 # Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
 NVCCFLAGS += -O3 -Xptxas -v
-
+CLANG_CFLAGS += -O3 -Xcuda-ptxas -v
 ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
     # For MSVC
 
@@ -139,9 +152,25 @@ else
 
 endif
 
+# compiler=clang Enables compilation with clang. 
 
-# Suffix to append to each binary
-BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION)
+ifeq ($(compiler), clang)
+	BIN_SUFFIX := sm$(SM_ARCH)_clang_$(NVCC_VERSION)
+	# Clangs needs few extra flags to point it to CUDA SDK
+	# and link the binaries with CUDA runtime.
+	CUDA_BASE=$(realpath $(join $(dir $(shell which nvcc)), ..))
+	CLANG_CFLAGS += --cuda-path=$(CUDA_BASE)
+	LIBINC += -L$(CUDA_BASE)/lib64 -Wl,-rpath=$(CUDA_BASE)/lib64
+	LIBS   += -lcudart
+
+	# Replace NVCC and its options with clang++.
+	NVCC = clang++
+	NVCCFLAGS = $(CLANG_CFLAGS)
+	SM_TARGETS = $(CLANG_SM_TARGETS)
+else
+	# Suffix to append to each binary
+	BIN_SUFFIX := sm$(SM_ARCH)_nvcc_$(NVCC_VERSION)
+endif
 
 
 #-------------------------------------------------------------------------------
@@ -149,5 +178,3 @@ BIN_SUFFIX := sm$(SM_ARCH)_$(NVCC_VERSION)
 #-------------------------------------------------------------------------------
 
 rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-
-

From ce2b3f695dc05e68348f2e1e14d9479c3578772c Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Fri, 8 Dec 2017 10:39:55 -0800
Subject: [PATCH 2/6] Fixed debug macros for clang.

Unlike nvcc, clang always sees both host and device-side code during
compilation. CUDA_LOG macro is used in both host and device code, so when it
expanded to contain device-only code, that resulted in errors when it was used
from the host-side functions.

In order to make CUDA_LOG work with clang it was split into two parts -- a pair
of target-attribute-based overloaded functions that perform host or device
specific parts of logging, and a printf which works on both sides.
---
 cutlass/util/debug.h | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h
index 81650932..3a4b2fd0 100644
--- a/cutlass/util/debug.h
+++ b/cutlass/util/debug.h
@@ -44,10 +44,26 @@ namespace cutlass {
  * Formats and prints the given message to stdout
  */
 #if !defined(CUDA_LOG)
-    #if !defined(__CUDA_ARCH__)
-        #define CUDA_LOG(format, ...) printf(format,__VA_ARGS__)
-    #else
-        #define CUDA_LOG(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__);
+    #if defined(__clang__) && defined(__CUDA__)
+static __device__ void cuda_log_location() {
+    printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ", blockIdx.x, blockIdx.y,
+           blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z);
+}
+static __host__ void cuda_log_location() {}
+        #define CUDA_LOG(format, ...)                                          \
+            do {                                                               \
+                cuda_log_location();                                           \
+                printf(format, __VA_ARGS__);                                   \
+            } while (0)
+    #else // NVCC
+        #if !defined(__CUDA_ARCH__)
+            #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
+        #else
+            #define CUDA_LOG(format, ...)                                      \
+                printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format,       \
+                       blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x,        \
+                       threadIdx.y, threadIdx.z, __VA_ARGS__);
+        #endif
     #endif
 #endif
 

From 81957b3a3ddd5b43683f3e16ef43b9d0a53f5d79 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Fri, 8 Dec 2017 16:59:32 -0800
Subject: [PATCH 3/6] Force inlining of few functions that rely on that for
 performance.

Clang is less agressive than nvccnvcc, so number of functions did not getn
inlined into the kernel by default. That prevented SROA from eliminating
loads/stores to temporary buffers and resulted in abysmal performance.

Replaced inline with __forceinline__ to ensure that we do inline the
functions necessary for optimal performance.
---
 cutlass/gemm/block_task.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cutlass/gemm/block_task.h b/cutlass/gemm/block_task.h
index d5700c50..3940fb6e 100644
--- a/cutlass/gemm/block_task.h
+++ b/cutlass/gemm/block_task.h
@@ -484,7 +484,8 @@ struct block_task
      *   - Applies the scalar multipliers and addends to the accumulators
      *   - Write the result to the output matrix
      */
-    inline __device__ void epilogue()
+    __forceinline__ __device__
+    void epilogue()
     {
         // Wait for predecessor thread block(s) to produce block-wide tile of
         // exclsuive partial-sums
@@ -551,7 +552,7 @@ struct block_task
      * Consume a tile of A and B each
      */
     template <bool DoGlobalPrefetch>
-    inline __device__
+    __forceinline__ __device__
     void consume_tile()
     {
         // Unroll BlockDpVectorsK iterations of outer-product accumulations
@@ -612,7 +613,7 @@ struct block_task
     /**
      * Compute GEMM
      */
-    inline __device__
+    __forceinline__ __device__
     void run()
     {
         // Quit if the thread block is fully out-of-bounds

From df4b4e4bb6a084127e61e6bdd1678eb19e26af94 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Mon, 11 Dec 2017 16:34:10 -0800
Subject: [PATCH 4/6] Added _cuda_ to the name of the executable to indicate
 that it's not clang's version.

---
 common.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common.mk b/common.mk
index 8ef8fe9d..672ea5b2 100644
--- a/common.mk
+++ b/common.mk
@@ -155,7 +155,8 @@ endif
 # compiler=clang Enables compilation with clang. 
 
 ifeq ($(compiler), clang)
-	BIN_SUFFIX := sm$(SM_ARCH)_clang_$(NVCC_VERSION)
+	# NVCC_VERSION is used as the proxy for the CUDA version.
+	BIN_SUFFIX := sm$(SM_ARCH)_clang_cuda_$(NVCC_VERSION)
 	# Clangs needs few extra flags to point it to CUDA SDK
 	# and link the binaries with CUDA runtime.
 	CUDA_BASE=$(realpath $(join $(dir $(shell which nvcc)), ..))

From 39616514d0d0b7cadbaabffeb55f863774b282db Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Wed, 3 Jan 2018 16:36:50 -0800
Subject: [PATCH 5/6] Reworked CUDA_LOG macro to print location&the message
 with one printf.

This replies on the fact that clang allows using device-side features
from __host__/__device__ functions from __host__ ones as long as we
don't have to generate code for that. Wrapping thread/blockIdx in
__host__ __device__ function allows using CUDA_LOG everywhere during
host and device compilation.
---
 cutlass/util/debug.h | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h
index 3a4b2fd0..c10e447f 100644
--- a/cutlass/util/debug.h
+++ b/cutlass/util/debug.h
@@ -44,26 +44,20 @@ namespace cutlass {
  * Formats and prints the given message to stdout
  */
 #if !defined(CUDA_LOG)
-    #if defined(__clang__) && defined(__CUDA__)
-static __device__ void cuda_log_location() {
-    printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ", blockIdx.x, blockIdx.y,
-           blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z);
-}
-static __host__ void cuda_log_location() {}
-        #define CUDA_LOG(format, ...)                                          \
-            do {                                                               \
-                cuda_log_location();                                           \
-                printf(format, __VA_ARGS__);                                   \
-            } while (0)
-    #else // NVCC
-        #if !defined(__CUDA_ARCH__)
-            #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
-        #else
-            #define CUDA_LOG(format, ...)                                      \
-                printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format,       \
-                       blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x,        \
-                       threadIdx.y, threadIdx.z, __VA_ARGS__);
-        #endif
+    #if !defined(__CUDA_ARCH__)
+        #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
+    #else
+inline __host__ __device__ unsigned get_threadidx_x() { return threadIdx.x; }
+inline __host__ __device__ unsigned get_threadidx_y() { return threadIdx.y; }
+inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; }
+inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; }
+inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; }
+inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; }
+    #define CUDA_LOG(format, ...)                                              \
+            printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format,           \
+                   get_blockidx_x(), get_blockidx_y(), get_blockidx_z(),       \
+                   get_threadidx_x(), get_threadidx_y(), get_threadidx_z(),    \
+                   __VA_ARGS__);
     #endif
 #endif
 

From 1c9b54df16c064f0cb964d1e5469383e9930dadc Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Wed, 3 Jan 2018 16:42:51 -0800
Subject: [PATCH 6/6] Whitespace fix.

---
 cutlass/util/debug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h
index c10e447f..2aedd17a 100644
--- a/cutlass/util/debug.h
+++ b/cutlass/util/debug.h
@@ -53,7 +53,7 @@ inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; }
 inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; }
 inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; }
 inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; }
-    #define CUDA_LOG(format, ...)                                              \
+        #define CUDA_LOG(format, ...)                                          \
             printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format,           \
                    get_blockidx_x(), get_blockidx_y(), get_blockidx_z(),       \
                    get_threadidx_x(), get_threadidx_y(), get_threadidx_z(),    \