Merge pull request #235 from Peter9606/issue_233_tranpose_update
transpose.h update based on issue #233
This commit is contained in:
commit
233d69aa6d
@ -26,6 +26,7 @@
|
||||
/*! \file
|
||||
\brief Basic copy routines for tensor views
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
namespace cutlass {
|
||||
namespace transform {
|
||||
@ -70,21 +71,21 @@ struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {
|
||||
int a3 = src_int[i3];
|
||||
|
||||
int b0, b1, b2, b3, c0;
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(b0) : "r"(a0), "r"(a1));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(c0) : "r"(a2), "r"(a3));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b0) : "r"(b0), "r"(c0));
|
||||
b0 = __byte_perm(a0, a1, 0x0040);
|
||||
c0 = __byte_perm(a2, a3, 0x0040);
|
||||
b0 = __byte_perm(b0, c0, 0x5410);
|
||||
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(b1) : "r"(a0), "r"(a1));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(c0) : "r"(a2), "r"(a3));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b1) : "r"(b1), "r"(c0));
|
||||
b1 = __byte_perm(a0, a1, 0x0051);
|
||||
c0 = __byte_perm(a2, a3, 0x0051);
|
||||
b1 = __byte_perm(b1, c0, 0x5410);
|
||||
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(b2) : "r"(a0), "r"(a1));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(c0) : "r"(a2), "r"(a3));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b2) : "r"(b2), "r"(c0));
|
||||
b2 = __byte_perm(a0, a1, 0x0062);
|
||||
c0 = __byte_perm(a2, a3, 0x0062);
|
||||
b2 = __byte_perm(b2, c0, 0x5410);
|
||||
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(b3) : "r"(a0), "r"(a1));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(c0) : "r"(a2), "r"(a3));
|
||||
asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b3) : "r"(b3), "r"(c0));
|
||||
b3 = __byte_perm(a0, a1, 0x0073);
|
||||
c0 = __byte_perm(a2, a3, 0x0073);
|
||||
b3 = __byte_perm(b3, c0, 0x5410);
|
||||
|
||||
dst_int[i0] = b0;
|
||||
dst_int[i1] = b1;
|
||||
|
Loading…
Reference in New Issue
Block a user