Fixed debug macros for clang.

Unlike nvcc, clang always sees both host- and device-side code during
compilation. The CUDA_LOG macro is used in both host and device code, so when
it expanded to device-only code, using it from host-side functions resulted in
errors.

To make CUDA_LOG work with clang, it was split into two parts: a pair of
target-attribute-based overloaded functions that perform the host- or
device-specific part of logging, and a printf call that works on both sides.
This commit is contained in:
Artem Belevich 2017-12-08 10:39:55 -08:00
parent e9e7cd4d44
commit ce2b3f695d

View File

@ -44,10 +44,26 @@ namespace cutlass {
* Formats and prints the given message to stdout
*/
#if !defined(CUDA_LOG)
#if defined(__clang__) && defined(__CUDA__)
/**
 * Device overload: prints the "[block (x,y,z), thread (x,y,z)]: " prefix
 * built from blockIdx and threadIdx. No newline is emitted, so the text
 * printed next continues on the same line.
 */
static __device__ void cuda_log_location() {
  printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ",
         blockIdx.x, blockIdx.y, blockIdx.z,
         threadIdx.x, threadIdx.y, threadIdx.z);
}

/**
 * Host overload: intentionally empty, so a call from host code prints no
 * location prefix.
 */
static __host__ void cuda_log_location() {
}
/**
 * CUDA_LOG for the clang CUDA build (selected by the enclosing
 * `defined(__clang__) && defined(__CUDA__)` check).
 *
 * Calls cuda_log_location() first, then printf with the caller's format
 * string and arguments. The do/while(0) wrapper makes the two calls act as a
 * single statement; the caller supplies the terminating semicolon.
 *
 * At least one argument must follow `format`, because __VA_ARGS__ is placed
 * after a comma.
 */
#define CUDA_LOG(format, ...)       \
  do {                              \
    cuda_log_location();            \
    printf(format, __VA_ARGS__);    \
  } while (0)
#else // NVCC
#if !defined(__CUDA_ARCH__)
/**
 * CUDA_LOG variant selected by the enclosing `#if !defined(__CUDA_ARCH__)`:
 * forwards the format string and its arguments straight to printf, with no
 * block/thread prefix.
 *
 * Defined exactly once; a second definition whose replacement list differs
 * only in whitespace is still an incompatible macro redefinition.
 *
 * The expansion is a single expression with no trailing semicolon, so the
 * caller supplies it. At least one argument must follow `format`, because
 * __VA_ARGS__ is placed after a comma.
 */
#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
#else
/**
 * CUDA_LOG variant for the `#else` branch of `#if !defined(__CUDA_ARCH__)`:
 * prints "[block (x,y,z), thread (x,y,z)]: " built from blockIdx and
 * threadIdx, followed by the caller's formatted message, in one printf call.
 *
 * `format` must be a string literal: it is joined to the prefix by
 * adjacent-literal concatenation. At least one argument must follow
 * `format`, because __VA_ARGS__ is placed after a comma.
 *
 * Defined exactly once, and the expansion deliberately has no trailing
 * semicolon. The caller's own `;` terminates the statement, so
 * `if (c) CUDA_LOG(...); else ...` parses correctly and this variant matches
 * the one used when __CUDA_ARCH__ is not defined.
 */
#define CUDA_LOG(format, ...)                                   \
  printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format,      \
         blockIdx.x, blockIdx.y, blockIdx.z,                    \
         threadIdx.x, threadIdx.y, threadIdx.z, __VA_ARGS__)
#endif
#endif
#endif