Reworked CUDA_LOG macro to print the location and the message with one printf.

This relies on the fact that clang allows using device-side features
in __host__ __device__ functions called from __host__ ones, as long as we
don't have to generate code for that. Wrapping threadIdx/blockIdx in
__host__ __device__ functions allows using CUDA_LOG everywhere during
host and device compilation.
This commit is contained in:
Artem Belevich 2018-01-03 16:36:50 -08:00
parent df4b4e4bb6
commit 39616514d0

View File

@ -44,26 +44,20 @@ namespace cutlass {
* Formats and prints the given message to stdout * Formats and prints the given message to stdout
*/ */
#if !defined(CUDA_LOG) #if !defined(CUDA_LOG)
#if defined(__clang__) && defined(__CUDA__) #if !defined(__CUDA_ARCH__)
static __device__ void cuda_log_location() { #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ", blockIdx.x, blockIdx.y, #else
blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); inline __host__ __device__ unsigned get_threadidx_x() { return threadIdx.x; }
} inline __host__ __device__ unsigned get_threadidx_y() { return threadIdx.y; }
static __host__ void cuda_log_location() {} inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; }
#define CUDA_LOG(format, ...) \ inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; }
do { \ inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; }
cuda_log_location(); \ inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; }
printf(format, __VA_ARGS__); \ #define CUDA_LOG(format, ...) \
} while (0) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
#else // NVCC get_blockidx_x(), get_blockidx_y(), get_blockidx_z(), \
#if !defined(__CUDA_ARCH__) get_threadidx_x(), get_threadidx_y(), get_threadidx_z(), \
#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) __VA_ARGS__);
#else
#define CUDA_LOG(format, ...) \
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \
threadIdx.y, threadIdx.z, __VA_ARGS__);
#endif
#endif #endif
#endif #endif