diff --git a/include/cutlass/transform/thread/transpose.h b/include/cutlass/transform/thread/transpose.h index 268e6481..3ce1841a 100644 --- a/include/cutlass/transform/thread/transpose.h +++ b/include/cutlass/transform/thread/transpose.h @@ -26,6 +26,7 @@ /*! \file \brief Basic copy routines for tensor views */ +#pragma once namespace cutlass { namespace transform { @@ -70,21 +71,21 @@ struct Transpose , int8_t> { int a3 = src_int[i3]; int b0, b1, b2, b3, c0; - asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(b0) : "r"(a0), "r"(a1)); - asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(c0) : "r"(a2), "r"(a3)); - asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b0) : "r"(b0), "r"(c0)); + b0 = __byte_perm(a0, a1, 0x0040); + c0 = __byte_perm(a2, a3, 0x0040); + b0 = __byte_perm(b0, c0, 0x5410); - asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(b1) : "r"(a0), "r"(a1)); - asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(c0) : "r"(a2), "r"(a3)); - asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b1) : "r"(b1), "r"(c0)); + b1 = __byte_perm(a0, a1, 0x0051); + c0 = __byte_perm(a2, a3, 0x0051); + b1 = __byte_perm(b1, c0, 0x5410); - asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(b2) : "r"(a0), "r"(a1)); - asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(c0) : "r"(a2), "r"(a3)); - asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b2) : "r"(b2), "r"(c0)); + b2 = __byte_perm(a0, a1, 0x0062); + c0 = __byte_perm(a2, a3, 0x0062); + b2 = __byte_perm(b2, c0, 0x5410); - asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(b3) : "r"(a0), "r"(a1)); - asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(c0) : "r"(a2), "r"(a3)); - asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b3) : "r"(b3), "r"(c0)); + b3 = __byte_perm(a0, a1, 0x0073); + c0 = __byte_perm(a2, a3, 0x0073); + b3 = __byte_perm(b3, c0, 0x5410); dst_int[i0] = b0; dst_int[i1] = b1;