57 typename MultiplyAdd_,
71 int kScalarsPerLdgCAndStgD_,
136 template <enum MatrixLayout::Kind,
typename GemmConfig_>
141 template <
typename GemmConfig_>
147 typedef typename GemmConfig_::ScalarA
Scalar;
164 GemmConfig_::kScalarsPerLdgA>
172 Shape<GemmConfig_::kStages,
173 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
174 GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
178 GemmConfig_::kScalarsPerStsA>
186 typename GemmConfig_::OutputTile,
188 typename GemmConfig_::Warps,
190 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
192 typename GemmConfig_::InstructionShape,
194 GemmConfig_::kStages,
196 GemmConfig_::kScalarsPerLdsA,
204 template <
typename GemmConfig_>
210 typedef typename GemmConfig_::ScalarA
Scalar;
225 Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
227 GemmConfig_::kScalarsPerLdgA>
237 Shape<GemmConfig_::kStages,
238 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
239 GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
243 GemmConfig_::kScalarsPerStsA,
246 GlobalTileTraits::Threads::kW * kScalarsIn4B>
254 typename GemmConfig_::OutputTile,
256 typename GemmConfig_::Warps,
258 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
260 typename GemmConfig_::InstructionShape,
262 GemmConfig_::kStages,
264 GemmConfig_::kScalarsPerLdsA,
266 SharedStoreTileTraits::kSkew>
272 template <enum MatrixLayout::Kind,
typename GemmConfig_>
277 template <
typename GemmConfig_>
283 typedef typename GemmConfig_::ScalarB
Scalar;
298 Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
300 GemmConfig_::kScalarsPerLdgB>
310 Shape<GemmConfig_::kStages,
311 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
312 GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
316 GemmConfig_::kScalarsPerStsB,
319 GlobalTileTraits::Threads::kW * kScalarsIn4B>
327 typename GemmConfig_::OutputTile,
329 typename GemmConfig_::Warps,
331 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
333 typename GemmConfig_::InstructionShape,
335 GemmConfig_::kStages,
337 GemmConfig_::kScalarsPerLdsB,
339 SharedStoreTileTraits::kSkew>
345 template <
typename GemmConfig_>
351 typedef typename GemmConfig_::ScalarB
Scalar;
368 GemmConfig_::kScalarsPerLdgB>
376 Shape<GemmConfig_::kStages,
377 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
378 GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
382 GemmConfig_::kScalarsPerStsB>
390 typename GemmConfig_::OutputTile,
392 typename GemmConfig_::Warps,
394 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
396 typename GemmConfig_::InstructionShape,
398 GemmConfig_::kStages,
400 GemmConfig_::kScalarsPerLdsB,
410 typename GemmConfig_,
412 typename GlobalLoadStreamA_,
414 typename GlobalLoadStreamB_,
416 typename SharedLoadStreamA_,
418 typename SharedLoadStreamB_,
424 typename Index_ = int,
439 typedef typename GlobalLoadStreamA_::Scalar
ScalarA;
446 typedef typename GlobalLoadStreamB_::Scalar
ScalarB;
496 template <
typename GemmDesc_>
505 global_stream_a.initialize(reinterpret_cast<ScalarA const*>(desc.d_a), desc.lda);
512 error_code =
global_stream_b.initialize(reinterpret_cast<ScalarB const*>(desc.d_b), desc.ldb);
524 template <
typename GlobalLoadStream_,
typename SharedLoadStream_>
527 typename GlobalLoadStream_::SharedStorage
global;
529 typename SharedLoadStream_::SharedStorage
shared;
557 shared_storage.main_loop.
stream_a.global,
561 shared_storage.main_loop.
stream_b.global,
598 CUTLASS_DEVICE
void copy(
int step) {
610 CUTLASS_DEVICE
typename SharedLoadStreamA::Fragment
const&
fragment_a(
int step)
const {
615 CUTLASS_DEVICE
typename SharedLoadStreamB::Fragment
const&
fragment_b(
int step)
const {
628 typename SharedLoadStreamA::FetchedFragment
fetched_a[2];
634 typename SharedLoadStreamB::FetchedFragment
fetched_b[2];
641 if (SharedLoadStreamA::Iterator::kRequiresLoadFence ||
642 SharedLoadStreamB::Iterator::kRequiresLoadFence) {
653 template <
typename GemmTileTraitsHelperA_,
typename GemmTileTraitsHelperB_,
typename Index_>
661 typedef TileStoreIterator<
typename GemmTileTraitsHelperA_::SharedStoreTileTraits,
662 typename GemmTileTraitsHelperA_::SharedStoreTileTraits::Scalar,
676 typedef TileStoreIterator<
typename GemmTileTraitsHelperB_::SharedStoreTileTraits,
677 typename GemmTileTraitsHelperB_::SharedStoreTileTraits::Scalar,
686 typedef TileLoadIterator<
typename GemmTileTraitsHelperA_::SharedLoadTileTraits,
687 typename GemmTileTraitsHelperA_::Scalar,
694 typedef TileLoadIterator<
typename GemmTileTraitsHelperB_::SharedLoadTileTraits,
695 typename GemmTileTraitsHelperB_::Scalar,
711 typename GemmConfig_,
715 typename Index_ = int,
727 typename Helper_::GlobalLoadStreamA,
729 typename Helper_::GlobalLoadStreamB,
731 typename Helper_::SharedLoadStreamA,
733 typename Helper_::SharedLoadStreamB,
737 IdentityBlockSwizzle,
741 ClearAccumulators<typename GemmConfig_::Accumulators::Element> > {
Index n
Definition: gemm_traits.h:483
static int const kWarpSize
The default warp size (32 threads per warp).
Definition: gemm_traits.h:104
Epilogue::SharedStorage epilogue
Definition: gemm_traits.h:547
static int const kScalarsPerStsA
Definition: gemm_traits.h:110
GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^N.
Definition: gemm_traits.h:340
ScalarA_ ScalarA
The scalar for A.
Definition: gemm_traits.h:82
GlobalLoadStreamA_ GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: gemm_traits.h:435
GlobalStoreIteratorD::Scalar ScalarD
The scalar for D.
Definition: gemm_epilogue.h:98
MultiplyAdd_ MultiplyAdd
The functor to do D = A*B + C.
Definition: gemm_traits.h:93
static int const kAccumulatorsPerLdsA
The number of accumulators that are going to be fed from one LDS A/B.
Definition: gemm_traits.h:127
Definition: load_store.h:42
static int const kScalarsPerLdsA
Definition: gemm_traits.h:111
SharedLoadStreamA_ SharedLoadStreamA
The iterator for A to load from shared memory.
Definition: gemm_traits.h:449
MultiplyAdd::InstructionShape InstructionShape
The shape of the instruction.
Definition: gemm_traits.h:95
SharedLoadStreamA::Params shared_stream_a
The params for the A stream from shared memory.
Definition: gemm_traits.h:489
Definition: gemm_shared_tile.h:129
GlobalLoadStreamB_ GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: gemm_traits.h:442
Definition: gemm_shared_tile.h:80
CUTLASS_DEVICE void inc_stage()
Increment the stage.
Definition: gemm_traits.h:620
TileStoreIterator< typename GemmTileTraitsHelperA_::SharedStoreTileTraits, typename GemmTileTraitsHelperA_::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: gemm_traits.h:665
static int const kScalarsPerLdsB
Definition: gemm_traits.h:116
Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.
Epilogue::ScalarD ScalarD
Definition: gemm_traits.h:471
The storage in shared memory.
Definition: gemm_traits.h:543
SharedLoadStream< SharedLoadIteratorB > SharedLoadStreamB
The stream to load B from shared memory.
Definition: gemm_traits.h:700
Index k
Definition: gemm_traits.h:483
Definition: gemm_traits.h:525
Definition: gemm_global_tile.h:70
SharedLoadStreamA::FetchedFragment fetched_a[2]
The fragments to fetch A.
Definition: gemm_traits.h:628
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:241
GemmConfig_::ScalarB Scalar
The input scalar.
Definition: gemm_traits.h:283
GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for B^T.
Definition: gemm_traits.h:383
SharedLoadStreamB_ SharedLoadStreamB
The iterator for B to load from shared memory.
Definition: gemm_traits.h:451
static int const kScalarsPerStgD
The number of scalars per STS/LDS/STG for D.
Definition: gemm_traits.h:122
CUTLASS_DEVICE void copy(int step)
Trigger the copies from shared memory to registers.
Definition: gemm_traits.h:598
GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^N.
Definition: gemm_traits.h:301
A template defining Fragment Concept.
Definition: fragment.h:99
SharedLoadStreamA stream_a
The stream for A.
Definition: gemm_traits.h:626
SharedLoadStream< SharedLoadIteratorA > SharedLoadStreamA
The stream to load A from shared memory.
Definition: gemm_traits.h:692
Definition: gemm_shared_tile.h:38
ScalarC_ ScalarC
The scalar for C.
Definition: gemm_traits.h:86
CUTLASS_DEVICE void copy()
Trigger the copies from shared memory to registers.
Definition: gemm_traits.h:566
GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, 0 > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^N.
Definition: gemm_traits.h:199
Epilogue_ Epilogue
The epilogue.
Definition: gemm_traits.h:468
GlobalLoadStreamA_::Scalar ScalarA
The scalar for A.
Definition: gemm_traits.h:439
Definition: tile_iterator.h:62
GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^N.
Definition: gemm_traits.h:165
ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
The number of warps.
Definition: gemm_traits.h:102
GemmConfig_::ScalarA Scalar
The input scalar.
Definition: gemm_traits.h:147
Definition: gemm_shared_tile.h:198
GlobalLoadStreamB::SharedStoreStorage SharedStoreStorageB
The shared storage for B.
Definition: gemm_traits.h:457
Definition: gemm_global_tile.h:159
Epilogue::ScalarC ScalarC
The scalars in the epilogue.
Definition: gemm_traits.h:470
GlobalLoadStream< GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: gemm_traits.h:683
SharedLoadStreamB stream_b
The stream for B.
Definition: gemm_traits.h:632
Assemble the shared load stream for A/B.
Definition: gemm_traits.h:590
GlobalLoadStreamB stream_b
The stream for B.
Definition: gemm_traits.h:586
GemmConfig::MultiplyAdd MultiplyAdd
The multiply-add functor.
Definition: gemm_traits.h:463
static CUTLASS_DEVICE void shared_load_fence(bool in_loop)
The memory fence for shared loads.
Definition: gemm_traits.h:640
GemmConfig_ GemmConfig
The configuration.
Definition: gemm_traits.h:430
Definition: gemm_global_stream.h:161
SharedLoadStreamB::TransformedFragment transformed_b[2]
The fragments to transform B.
Definition: gemm_traits.h:636
Definition: gemm_traits.h:273
GlobalLoadStreamA stream_a
The stream for A.
Definition: gemm_traits.h:584
GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^T.
Definition: gemm_traits.h:267
Definition: clear_accumulators.h:38
StreamSharedStorage< GlobalLoadStreamB, SharedLoadStreamB > stream_b
Definition: gemm_traits.h:537
The params.
Definition: gemm_traits.h:481
static int const kScalarsPerLdgA
The number of scalars per LDG/STS/LDS for A.
Definition: gemm_traits.h:109
CUTLASS_DEVICE SharedLoadStreamB::Fragment const & fragment_b(int step) const
The fragment B.
Definition: gemm_traits.h:615
Copy< typename GlobalLoadIteratorB::Fragment > GlobalTransformerB
The data converter for B before storing to shared memory.
Definition: gemm_traits.h:674
GemmConfig_::ScalarB Scalar
The input scalar.
Definition: gemm_traits.h:351
Describes layouts of matrices.
Definition: matrix_traits.h:35
GemmGlobalIteratorAb< typename GemmTileTraitsHelperB_::GlobalTileTraits, Index_ > GlobalLoadIteratorB
The global iterator to load B from global memory.
Definition: gemm_traits.h:672
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:302
Definition: gemm_traits.h:533
Definition: matrix_traits.h:36
CUTLASS_DEVICE void residue(Index k, bool skip_clear=false)
Execute the residue code.
Definition: gemm_traits.h:578
MultiplyAdd::Accumulators Accumulators
The accumulators.
Definition: gemm_traits.h:99
ClearAccumulators_ ClearAccumulators
Clear the accumulators.
Definition: gemm_traits.h:478
Definition: gemm_shared_stream.h:44
GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: gemm_traits.h:228
Defines a type for restructuring a tile.
Defines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory...
Shape< A_::kD/B_::kD, A_::kH/B_::kH, A_::kW/B_::kW, A_::kC/B_::kC > Shape
Definition: shape.h:126
static int const kScalarsPerStsB
Definition: gemm_traits.h:115
Defines abstractions for efficiently clearing accumulator tiles.
Definition: gemm_traits.h:79
Assemble the global load streams for A/B.
Definition: gemm_traits.h:551
static int const kScalarsPerStsD
Definition: gemm_traits.h:123
static CUTLASS_DEVICE void shared_store_fence(bool in_loop)
The memory fence for shared stores.
Definition: gemm_traits.h:648
GemmConfig_::ScalarA Scalar
The input scalar.
Definition: gemm_traits.h:210
Definition: gemm_traits.h:137
CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const &desc)
Initialize the parameters.
Definition: gemm_traits.h:497
GlobalLoadStream_::SharedStorage global
Definition: gemm_traits.h:527
Definition: matrix_traits.h:43
Definition: identity_block_swizzle.h:37
GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for A^N.
Definition: gemm_traits.h:179
ScalarB_ ScalarB
The scalar for B.
Definition: gemm_traits.h:84
GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:353
GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:285
GlobalLoadStreamB_::Scalar ScalarB
The scalar for B.
Definition: gemm_traits.h:446
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
GlobalLoadStreamA::SharedStoreStorage SharedStoreStorageA
The shared storage for A.
Definition: gemm_traits.h:454
GlobalLoadStream< GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: gemm_traits.h:668
Definition: gemm_traits.h:428
MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
The number of accumulators per warp.
Definition: gemm_traits.h:97
SharedLoadStreamA::TransformedFragment transformed_a[2]
The fragments to transform A.
Definition: gemm_traits.h:630
SharedLoadStream_::SharedStorage shared
Definition: gemm_traits.h:529
GlobalLoadStreamB::Params global_stream_b
The params for the B stream.
Definition: gemm_traits.h:487
SharedLoadStreamB::FetchedFragment fetched_b[2]
The fragments to fetch B.
Definition: gemm_traits.h:634
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
static int const kScalarsPerLdgC
The number of scalars per LDG for C.
Definition: gemm_traits.h:119
ScalarD_ ScalarD
The scalar for D.
Definition: gemm_traits.h:88
static int const kThreads
The number of threads.
Definition: gemm_traits.h:106
Defines functors for mapping blockIdx to partitions of the GEMM computation.
Index m
The dimensions of the GEMM.
Definition: gemm_traits.h:483
BlockSwizzle_ BlockSwizzle
The block swizzle to reorganize the grid.
Definition: gemm_traits.h:474
TileLoadIterator< typename GemmTileTraitsHelperA_::SharedLoadTileTraits, typename GemmTileTraitsHelperA_::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: gemm_traits.h:690
Definition: matrix_traits.h:36
TileLoadIterator< typename GemmTileTraitsHelperB_::SharedLoadTileTraits, typename GemmTileTraitsHelperB_::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: gemm_traits.h:698
CUTLASS_DEVICE SharedLoadStream(Params const &params, SharedStorage &shared_storage)
Ctor.
Definition: gemm_traits.h:592
CUTLASS_DEVICE GlobalLoadStream(Params const &params, SharedStorage &shared_storage, dim3 const &block)
Ctor.
Definition: gemm_traits.h:553
GlobalLoadIteratorC::Scalar ScalarC
The scalar for C.
Definition: gemm_epilogue.h:96
Index_ Index
The index.
Definition: gemm_traits.h:476
GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:149
TileStoreIterator< typename GemmTileTraitsHelperB_::SharedStoreTileTraits, typename GemmTileTraitsHelperB_::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: gemm_traits.h:680
Epilogue::Params epilogue
The params for the epilogue.
Definition: gemm_traits.h:493
Kind
Definition: matrix_traits.h:36
GlobalLoadStreamA::Params global_stream_a
The params for the A stream.
Definition: gemm_traits.h:485
The shared storage.
Definition: clear_accumulators.h:40
CUTLASS_DEVICE void commit(int step)
Commit the data.
Definition: gemm_traits.h:604
static int const kScalarsPerLdsD
Definition: gemm_traits.h:124
Implements efficient loading of the thread block-level tile from global memory and storing to shared ...
MainLoopSharedStorage main_loop
Definition: gemm_traits.h:545
static MatrixLayout::Kind const kLayoutA
The layout of A.
Definition: gemm_traits.h:437
OutputTile_ OutputTile
The tile.
Definition: gemm_traits.h:91
static int const kScalarsPerLdgB
The number of scalars per LDG/STS/LDS for B.
Definition: gemm_traits.h:114
Definition: matrix_traits.h:43
Definition: gemm_traits.h:654
ReshapeThreads< Tile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:87
GemmGlobalIteratorAb< typename GemmTileTraitsHelperA_::GlobalTileTraits, Index_ > GlobalLoadIteratorA
The global iterator to load A from global memory.
Definition: gemm_traits.h:657
GemmConfig::OutputTile OutputTile
The output tile.
Definition: gemm_traits.h:432
Defines properties of matrices used to denote layout and operands to GEMM kernels.
Copy< typename GlobalLoadIteratorA::Fragment > GlobalTransformerA
The data converter for A before storing to shared memory.
Definition: gemm_traits.h:659
CUTLASS_DEVICE void commit()
Commit the data.
Definition: gemm_traits.h:572
GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, 0 > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^T.
Definition: gemm_traits.h:403
ClearAccumulators::SharedStorage clear
Definition: gemm_traits.h:539
StreamSharedStorage< GlobalLoadStreamA, SharedLoadStreamA > stream_a
Definition: gemm_traits.h:535
GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^T.
Definition: gemm_traits.h:369
Defines abstractions for managing loading and storing fragments to shared memory in the efficient GEM...
Computes derived counts of a Layout Concept based class.
Definition: shape.h:79
Defines conversion operations among Fragments of different base type.
SharedLoadStreamB::Params shared_stream_b
The params for the B stream from shared memory.
Definition: gemm_traits.h:491
Definition: gemm_traits.h:723
CUTLASS_DEVICE SharedLoadStreamA::Fragment const & fragment_a(int step) const
The fragment A.
Definition: gemm_traits.h:610
static MatrixLayout::Kind const kLayoutB
The layout of B.
Definition: gemm_traits.h:444
static int const kAccumulatorsPerLdsB
Definition: gemm_traits.h:128
static int const kStages
The number of stages in shared memory used to implement double, triple, or deeper buffering.
Definition: gemm_traits.h:131
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:620
ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: thread_multiply_add.h:51
GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar
The scalar stored in shared memory.
Definition: gemm_traits.h:212