From 8f1fe7a13298389a55cf3d4fe1f63215fcc445ba Mon Sep 17 00:00:00 2001
From: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com>
Date: Tue, 22 Mar 2022 17:21:18 +0100
Subject: [PATCH] Fix separate compilation `-dc` (#433)

* Fix separate compilation `-dc`

- When cutlass is included in multiple compilation units
  compiled with `-dc`, the OOB_NAN_F16x8 device constant is
  instantiated multiple times, causing a
  "Multiple definition of '_ZN7cutlass4arch13OOB_NAN_F16x8E'" error.
  This PR makes the variable a function-local static constant, as it
  is not modified at runtime.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Fix

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Test GH

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Revert test GH

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
---
 include/cutlass/arch/memory_sm80.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h
index f26e05e3..01820adf 100644
--- a/include/cutlass/arch/memory_sm80.h
+++ b/include/cutlass/arch/memory_sm80.h
@@ -92,11 +92,11 @@ struct cp_async<SizeInBytes, CacheOperation::Always> {
   CUTLASS_DEVICE
   cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
     #if CUDA_CP_ASYNC_ACTIVATED
- 
+
       // Make sure the size is supported.
       static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
                 "Size is not supported");
-   
+
       unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
 
       asm volatile(
@@ -135,7 +135,7 @@ struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
       // Make sure the size is supported.
       static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
                 "Size is not supported");
-   
+
       unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
       int src_in_bytes = (pred_guard ? SizeInBytes : 0);
 
@@ -162,9 +162,6 @@ struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
   }
 };
 
-__device__ __constant__ uint4 OOB_NAN_F16x8 = {0x7eff7eff, 0x7eff7eff,
-                                               0x7eff7eff, 0x7eff7eff};
-
 /// Partial specialization
 template <>
 struct cp_async_nan<16, CacheOperation::Always> {
@@ -174,7 +171,10 @@ struct cp_async_nan<16, CacheOperation::Always> {
   CUTLASS_DEVICE
   cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
     #if CUDA_CP_ASYNC_ACTIVATED
-    
+
+      static __constant__ uint4 OOB_NAN_F16x8 = {0x7eff7eff, 0x7eff7eff,
+                                                 0x7eff7eff, 0x7eff7eff};
+
       unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
 
       asm volatile(
@@ -216,7 +216,7 @@ struct cp_async<SizeInBytes, CacheOperation::Global> {
   CUTLASS_DEVICE
   cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
     #if CUDA_CP_ASYNC_ACTIVATED
-    
+
       static_assert(SizeInBytes == 16,
         "cp.async only supports CacheOperation::Global when access size is 16B.");