From 39616514d0d0b7cadbaabffeb55f863774b282db Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 3 Jan 2018 16:36:50 -0800 Subject: [PATCH] Reworked CUDA_LOG macro to print location&the message with one printf. This relies on the fact that clang allows using device-side features from __host__/__device__ functions from __host__ ones as long as we don't have to generate code for that. Wrapping thread/blockIdx in __host__ __device__ function allows using CUDA_LOG everywhere during host and device compilation. --- cutlass/util/debug.h | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/cutlass/util/debug.h b/cutlass/util/debug.h index 3a4b2fd0..c10e447f 100644 --- a/cutlass/util/debug.h +++ b/cutlass/util/debug.h @@ -44,26 +44,20 @@ namespace cutlass { * Formats and prints the given message to stdout */ #if !defined(CUDA_LOG) - #if defined(__clang__) && defined(__CUDA__) -static __device__ void cuda_log_location() { - printf("[block (%d,%d,%d), thread (%d,%d,%d)]: ", blockIdx.x, blockIdx.y, - blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z); -} -static __host__ void cuda_log_location() {} - #define CUDA_LOG(format, ...) \ - do { \ - cuda_log_location(); \ - printf(format, __VA_ARGS__); \ - } while (0) - #else // NVCC - #if !defined(__CUDA_ARCH__) - #define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) - #else - #define CUDA_LOG(format, ...) \ - printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ - blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \ - threadIdx.y, threadIdx.z, __VA_ARGS__); - #endif + #if !defined(__CUDA_ARCH__) + #define CUDA_LOG(format, ...) 
printf(format, __VA_ARGS__) + #else +inline __host__ __device__ unsigned get_threadidx_x() { return threadIdx.x; } +inline __host__ __device__ unsigned get_threadidx_y() { return threadIdx.y; } +inline __host__ __device__ unsigned get_threadidx_z() { return threadIdx.z; } +inline __host__ __device__ unsigned get_blockidx_x() { return blockIdx.x; } +inline __host__ __device__ unsigned get_blockidx_y() { return blockIdx.y; } +inline __host__ __device__ unsigned get_blockidx_z() { return blockIdx.z; } + #define CUDA_LOG(format, ...) \ + printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + get_blockidx_x(), get_blockidx_y(), get_blockidx_z(), \ + get_threadidx_x(), get_threadidx_y(), get_threadidx_z(), \ + __VA_ARGS__); #endif #endif