From ce2b3f695dc05e68348f2e1e14d9479c3578772c Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Fri, 8 Dec 2017 10:39:55 -0800
Subject: [PATCH] Fixed debug macros for clang.

Unlike nvcc, clang always sees both host- and device-side code during
compilation. The CUDA_LOG macro is used in both host and device code, so when
it expanded to device-only code, using it from host-side functions resulted in
compilation errors.

In order to make CUDA_LOG work with clang, it was split into two parts -- a
pair of target-attribute-based overloaded functions that perform the host- or
device-specific part of logging, and a printf call which works on both sides.
---
 cutlass/util/debug.h | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h
index 81650932..3a4b2fd0 100644
--- a/cutlass/util/debug.h
+++ b/cutlass/util/debug.h
@@ -44,10 +44,26 @@ namespace cutlass {
  * Formats and prints the given message to stdout
  */
 #if !defined(CUDA_LOG)
-    #if !defined(__CUDA_ARCH__)
-        #define CUDA_LOG(format, ...) printf(format,__VA_ARGS__)
-    #else
-        #define CUDA_LOG(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__);
+    #if defined(__clang__) && defined(__CUDA__)
+static __device__ void cuda_log_location() {  // device overload: prints "[block (x,y,z), thread (x,y,z)]: " with no newline
+    printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ", blockIdx.x, blockIdx.y,
+           blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z);
+}
+static __host__ void cuda_log_location() {}  // host overload: empty body, prints no location prefix
+        #define CUDA_LOG(format, ...) /* clang: location prefix, then msg */   \
+            do {                                                               \
+                cuda_log_location(); /* overload picked by call-site target */ \
+                printf(format, __VA_ARGS__);                                   \
+            } while (0)
+    #else // NVCC
+        #if !defined(__CUDA_ARCH__)
+            #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)  // __CUDA_ARCH__ undefined: plain printf, no block/thread prefix
+        #else
+            #define CUDA_LOG(format, ...) /* expression; caller adds ';' */   \
+                printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format,       \
+                       blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x,        \
+                       threadIdx.y, threadIdx.z, __VA_ARGS__)
+        #endif
     #endif
 #endif