transpose.h update based on issue #233

1. Add '#pragma once' preprocessor directive
2. Replace inline prmt PTX with the __byte_perm intrinsic

Signed-off-by: Peter Han <fujun.han@iluvatar.ai>
This commit is contained in:
Peter Han 2021-04-14 19:58:00 +08:00
parent 8a3e4b8d02
commit 2142a05d9d

View File

@ -26,6 +26,7 @@
/*! \file
\brief Basic copy routines for tensor views
*/
#pragma once
namespace cutlass {
namespace transform {
@ -70,21 +71,21 @@ struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {
int a3 = src_int[i3];
int b0, b1, b2, b3, c0;
asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(b0) : "r"(a0), "r"(a1));
asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(c0) : "r"(a2), "r"(a3));
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b0) : "r"(b0), "r"(c0));
b0 = __byte_perm(a0, a1, 0x0040);
c0 = __byte_perm(a2, a3, 0x0040);
b0 = __byte_perm(b0, c0, 0x5410);
asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(b1) : "r"(a0), "r"(a1));
asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(c0) : "r"(a2), "r"(a3));
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b1) : "r"(b1), "r"(c0));
b1 = __byte_perm(a0, a1, 0x0051);
c0 = __byte_perm(a2, a3, 0x0051);
b1 = __byte_perm(b1, c0, 0x5410);
asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(b2) : "r"(a0), "r"(a1));
asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(c0) : "r"(a2), "r"(a3));
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b2) : "r"(b2), "r"(c0));
b2 = __byte_perm(a0, a1, 0x0062);
c0 = __byte_perm(a2, a3, 0x0062);
b2 = __byte_perm(b2, c0, 0x5410);
asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(b3) : "r"(a0), "r"(a1));
asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(c0) : "r"(a2), "r"(a3));
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b3) : "r"(b3), "r"(c0));
b3 = __byte_perm(a0, a1, 0x0073);
c0 = __byte_perm(a2, a3, 0x0073);
b3 = __byte_perm(b3, c0, 0x5410);
dst_int[i0] = b0;
dst_int[i1] = b1;