31 #include <cuda_fp16.h>    39 template <
typename GlobalIterator_>
    44   typedef typename GlobalIterator::Fragment 
Fragment;
    65     int const* src_int = 
reinterpret_cast<int const*
>(&src[0]);
    66     int* dst_int = 
reinterpret_cast<int*
>(&dst[0]);
    69     for (
int d = 0; d < FragmentShape::kD; ++d) {
    71       int const i0 = 2 * d + 0;
    72       int const i1 = 2 * d + 1;
    78       asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" : 
"=r"(b0) : 
"r"(a0), 
"r"(a1));
    79       asm volatile(
"prmt.b32 %0, %1, %2, 0x7632;" : 
"=r"(b1) : 
"r"(a0), 
"r"(a1));
 GlobalIterator_ GlobalIterator
The global iterator. 
Definition: hgemm_swizzle.h:42
CUTLASS_DEVICE HgemmSwizzle()
The src/dst must be half fragments. 
Definition: hgemm_swizzle.h:60
CUTLASS_DEVICE void transform(Fragment const &src, Fragment &dst)
Transform a fragment. 
Definition: hgemm_swizzle.h:63
Fragment InputFragment
The input fragment. 
Definition: hgemm_swizzle.h:49
Fragment OutputFragment
The output fragment. 
Definition: hgemm_swizzle.h:51
GlobalIterator::Fragment Fragment
The source fragment. 
Definition: hgemm_swizzle.h:44
Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers...
GlobalIterator::FragmentShape FragmentShape
The shape of the source fragment. 
Definition: hgemm_swizzle.h:46
Computes derived counts for a class based on the Layout concept. 
Definition: shape.h:79
Definition: hgemm_swizzle.h:40