Fix separate compilation -dc (#433)

* Fix separate compilation `-dc`

- When CUTLASS is included in multiple compilation units
  compiled with `-dc`, the OOB_NAN_F16x8 device constant is
  instantiated once per unit, causing a
  "Multiple definition of '_ZN7cutlass4arch13OOB_NAN_F16x8E'" error.
  This PR makes the variable a function-local static constant, since it
  is never modified at runtime.

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Fix

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Test GH

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>

* Revert test GH

Signed-off-by: Janusz Lisiecki <jlisiecki@nvidia.com>
This commit is contained in:
Janusz Lisiecki 2022-03-22 17:21:18 +01:00 committed by GitHub
parent 3ab1eacf09
commit 8f1fe7a132
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -92,11 +92,11 @@ struct cp_async<SizeInBytes, CacheOperation::Always> {
CUTLASS_DEVICE
cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
#if CUDA_CP_ASYNC_ACTIVATED
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
asm volatile(
@ -135,7 +135,7 @@ struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
int src_in_bytes = (pred_guard ? SizeInBytes : 0);
@ -162,9 +162,6 @@ struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
}
};
__device__ __constant__ uint4 OOB_NAN_F16x8 = {0x7eff7eff, 0x7eff7eff,
0x7eff7eff, 0x7eff7eff};
/// Partial specialization
template <>
struct cp_async_nan<16, CacheOperation::Always> {
@ -174,7 +171,10 @@ struct cp_async_nan<16, CacheOperation::Always> {
CUTLASS_DEVICE
cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
#if CUDA_CP_ASYNC_ACTIVATED
static __constant__ uint4 OOB_NAN_F16x8 = {0x7eff7eff, 0x7eff7eff,
0x7eff7eff, 0x7eff7eff};
unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
asm volatile(
@ -216,7 +216,7 @@ struct cp_async<SizeInBytes, CacheOperation::Global> {
CUTLASS_DEVICE
cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
#if CUDA_CP_ASYNC_ACTIVATED
static_assert(SizeInBytes == 16,
"cp.async only supports CacheOperation::Global when access size is 16B.");