enable L2::128B prefetch for cp.async by default (#1177)
This commit is contained in:
parent
1ab6cc7b68
commit
6e60b9b17c
@ -59,7 +59,7 @@ struct SM80_CP_ASYNC_CACHEALWAYS
|
|||||||
#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
|
#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
|
||||||
TS const* gmem_ptr = &gmem_src;
|
TS const* gmem_ptr = &gmem_src;
|
||||||
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
||||||
asm volatile("cp.async.ca.shared.global [%0], [%1], %2;\n"
|
asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2;\n"
|
||||||
:: "r"(smem_int_ptr),
|
:: "r"(smem_int_ptr),
|
||||||
"l"(gmem_ptr),
|
"l"(gmem_ptr),
|
||||||
"n"(sizeof(TS)));
|
"n"(sizeof(TS)));
|
||||||
@ -86,7 +86,7 @@ struct SM80_CP_ASYNC_CACHEGLOBAL
|
|||||||
#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
|
#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
|
||||||
TS const* gmem_ptr = &gmem_src;
|
TS const* gmem_ptr = &gmem_src;
|
||||||
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
||||||
asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n"
|
asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2;\n"
|
||||||
:: "r"(smem_int_ptr),
|
:: "r"(smem_int_ptr),
|
||||||
"l"(gmem_ptr),
|
"l"(gmem_ptr),
|
||||||
"n"(sizeof(TS)));
|
"n"(sizeof(TS)));
|
||||||
@ -115,7 +115,7 @@ struct SM80_CP_ASYNC_CACHEALWAYS_ZFILL
|
|||||||
TS const* gmem_ptr = &gmem_src;
|
TS const* gmem_ptr = &gmem_src;
|
||||||
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
||||||
int src_size = pred ? sizeof(TS) : 0;
|
int src_size = pred ? sizeof(TS) : 0;
|
||||||
asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %3;\n"
|
asm volatile("cp.async.ca.shared.global.L2::128B [%0], [%1], %2, %3;\n"
|
||||||
:: "r"(smem_int_ptr),
|
:: "r"(smem_int_ptr),
|
||||||
"l"(gmem_ptr),
|
"l"(gmem_ptr),
|
||||||
"n"(sizeof(TS)),
|
"n"(sizeof(TS)),
|
||||||
@ -145,7 +145,7 @@ struct SM80_CP_ASYNC_CACHEGLOBAL_ZFILL
|
|||||||
TS const* gmem_ptr = &gmem_src;
|
TS const* gmem_ptr = &gmem_src;
|
||||||
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
uint32_t smem_int_ptr = cast_smem_ptr_to_uint(&smem_dst);
|
||||||
int src_size = pred ? sizeof(TS) : 0;
|
int src_size = pred ? sizeof(TS) : 0;
|
||||||
asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %3;\n"
|
asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2, %3;\n"
|
||||||
:: "r"(smem_int_ptr),
|
:: "r"(smem_int_ptr),
|
||||||
"l"(gmem_ptr),
|
"l"(gmem_ptr),
|
||||||
"n"(sizeof(TS)),
|
"n"(sizeof(TS)),
|
||||||
|
Loading…
Reference in New Issue
Block a user