From 8f1fe7a13298389a55cf3d4fe1f63215fcc445ba Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com> Date: Tue, 22 Mar 2022 17:21:18 +0100 Subject: [PATCH] Fix separate compilation `-dc` (#433) * Fix separate compilation `-dc` - when cutlass is included in multiple compilation units compiled with `-dc`, the OOB_NAN_F16x8 device constant is instantiated multiple times, causing a Multiple definition of '_ZN7cutlass4arch13OOB_NAN_F16x8E' error. This PR makes this variable a local constant, as it is not modified at runtime. Signed-off-by: Janusz Lisiecki * Fix Signed-off-by: Janusz Lisiecki * Test GH Signed-off-by: Janusz Lisiecki * Revert test GH Signed-off-by: Janusz Lisiecki --- include/cutlass/arch/memory_sm80.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h index f26e05e3..01820adf 100644 --- a/include/cutlass/arch/memory_sm80.h +++ b/include/cutlass/arch/memory_sm80.h @@ -92,11 +92,11 @@ struct cp_async { CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - + // Make sure the size is supported. static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), "Size is not supported"); - + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); asm volatile( @@ -135,7 +135,7 @@ struct cp_async_zfill { // Make sure the size is supported. static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), "Size is not supported"); - + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); int src_in_bytes = (pred_guard ? 
SizeInBytes : 0); @@ -162,9 +162,6 @@ struct cp_async_zfill { } }; -__device__ __constant__ uint4 OOB_NAN_F16x8 = {0x7eff7eff, 0x7eff7eff, - 0x7eff7eff, 0x7eff7eff}; - /// Partial specialization template <> struct cp_async_nan<16, CacheOperation::Always> { @@ -174,7 +171,10 @@ struct cp_async_nan<16, CacheOperation::Always> { CUTLASS_DEVICE cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) { #if CUDA_CP_ASYNC_ACTIVATED - + + static __constant__ uint4 OOB_NAN_F16x8 = {0x7eff7eff, 0x7eff7eff, + 0x7eff7eff, 0x7eff7eff}; + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); asm volatile( @@ -216,7 +216,7 @@ struct cp_async { CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - + static_assert(SizeInBytes == 16, "cp.async only supports CacheOperation::Global when access size is 16B.");