Add the .L2::128B qualifier to the cp.async instructions emitted by the four
SM80 cp.async copy structs (CACHEALWAYS, CACHEGLOBAL, and their ZFILL variants).
Note: leading indentation of the hunk lines was reconstructed from a
whitespace-collapsed copy of this patch; apply with whitespace-tolerant tooling
(e.g. `git apply --ignore-whitespace`) or regenerate against the real tree.

diff --git a/include/cute/arch/copy_sm80.hpp b/include/cute/arch/copy_sm80.hpp
index 7dd12de2..cf408b8e 100644
--- a/include/cute/arch/copy_sm80.hpp
+++ b/include/cute/arch/copy_sm80.hpp
@@ -59,7 +59,7 @@ struct SM80_CP_ASYNC_CACHEALWAYS
 #if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
     TS const* gmem_ptr = &gmem_src;
     uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n"
+    asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2;\n"
         :: "r"(smem_int_ptr),
            "l"(gmem_ptr),
            "n"(sizeof(TS)));
@@ -86,7 +86,7 @@ struct SM80_CP_ASYNC_CACHEGLOBAL
 #if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
     TS const* gmem_ptr = &gmem_src;
     uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
-    asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n"
+    asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2;\n"
         :: "r"(smem_int_ptr),
            "l"(gmem_ptr),
            "n"(sizeof(TS)));
@@ -115,7 +115,7 @@ struct SM80_CP_ASYNC_CACHEALWAYS_ZFILL
     TS const* gmem_ptr = &gmem_src;
     uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
     int src_size = pred ? sizeof(TS) : 0;
-    asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %3;\n"
+    asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2, %3;\n"
         :: "r"(smem_int_ptr),
            "l"(gmem_ptr),
            "n"(sizeof(TS)),
@@ -145,7 +145,7 @@ struct SM80_CP_ASYNC_CACHEGLOBAL_ZFILL
     TS const* gmem_ptr = &gmem_src;
     uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
     int src_size = pred ? sizeof(TS) : 0;
-    asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %3;\n"
+    asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n"
         :: "r"(smem_int_ptr),
            "l"(gmem_ptr),
            "n"(sizeof(TS)),