52 typename AccumulatorsPerThread_,
54 int kScalarsPerLdgA_ = 2,
56 int kScalarsPerLdgB_ = 2>
70 ThreadMultiplyAdd<AccumulatorsPerThread_, Shape<1, 4, 8>, half, half, half>,
94 template <enum MatrixLayout::Kind kLayout_,
typename Iterator_>
97 template <
typename Iterator_>
102 template <
typename Iterator_>
109 template <enum MatrixLayout::Kind kLayout_,
typename Iterator_>
112 template <
typename Iterator_>
117 template <
typename Iterator_>
124 template <enum MatrixLayout::Kind kLayout_,
typename GemmConfig_>
129 template <
typename GemmConfig_>
145 Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
147 GemmConfig_::kScalarsPerLdgA>
155 Shape<GemmConfig_::kStages,
156 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
157 GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
163 128 /
sizeof(half) / GlobalTileTraits::Threads::kW / 2>
171 typename GemmConfig_::OutputTile,
173 typename GemmConfig_::Warps,
175 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
177 typename GemmConfig_::InstructionShape,
179 GemmConfig_::kStages,
183 SharedStoreTileTraits::kSkew>
189 template <enum MatrixLayout::Kind kLayout_,
typename GemmConfig_>
194 template <
typename GemmConfig_>
210 Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
212 GemmConfig_::kScalarsPerLdgB>
220 Shape<GemmConfig_::kStages,
221 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
222 GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
228 128 /
sizeof(half) / GlobalTileTraits::Threads::kW / 2>
236 typename GemmConfig_::OutputTile,
238 typename GemmConfig_::Warps,
240 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
242 typename GemmConfig_::InstructionShape,
244 GemmConfig_::kStages,
248 SharedStoreTileTraits::kSkew>
260 typename OutputTile_,
262 typename EpilogueFunctor_,
266 int kScalarsPerLdgA_ = 2,
268 int kScalarsPerLdgB_ = 2,
270 typename Index_ =
int>
288 typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar,
304 typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar,
313 typedef TileLoadIterator<
typename GemmTileTraitsHelperA::SharedLoadTileTraits,
314 typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar,
321 typedef TileLoadIterator<
typename GemmTileTraitsHelperB::SharedLoadTileTraits,
322 typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar,
354 int kScalarsPerLdgA_ = 2,
356 int kScalarsPerLdgB_ = 2,
358 typename Index_ = int,
364 AccumulatorsPerThread_,
370 typename Helper_::GemmConfig,
372 typename Helper_::GlobalLoadStreamA,
374 typename Helper_::GlobalLoadStreamB,
376 typename Helper_::SharedLoadStreamA,
378 typename Helper_::SharedLoadStreamB,
380 typename Helper_::Epilogue,
382 IdentityBlockSwizzle,
386 typename Helper_::ClearAccumulators> {};
GemmGlobalIteratorAb< typename GemmTileTraitsHelperA::GlobalTileTraits, Index_ > GlobalLoadIteratorA
The iterator to load A from global memory.
Definition: hgemm_traits.h:282
Definition: load_store.h:42
Definition: gemm_shared_tile.h:129
Definition: gemm_shared_tile.h:80
Definition: gemm_epilogue.h:53
Defines iterators for efficiently loading and storing to global memory.
GemmGlobalIteratorAb< typename GemmTileTraitsHelperB::GlobalTileTraits, Index_ > GlobalLoadIteratorB
The iterator to load B from global memory.
Definition: hgemm_traits.h:298
ClearAccumulators< typename MultiplyAdd::ScalarC > ClearAccumulators
The object to clear accumulators.
Definition: hgemm_traits.h:332
Defines structural properties of complete GEMM computation.
TileStoreIterator< typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: hgemm_traits.h:291
GlobalLoadStream< GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: hgemm_traits.h:294
HgemmCrosswiseGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, half const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^N.
Definition: hgemm_traits.h:213
GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > Base
The base config.
Definition: hgemm_traits.h:198
SharedLoadStream< SharedLoadIteratorA > SharedLoadStreamA
The stream to load A from shared memory.
Definition: hgemm_traits.h:319
Definition: hgemm_traits.h:368
Definition: tile_iterator.h:62
Definition: gemm_shared_tile.h:198
TileLoadIterator< typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: hgemm_traits.h:325
Definition: gemm_global_tile.h:159
GemmEpilogue< GemmEpilogueTraits > Epilogue
The epilogue.
Definition: hgemm_traits.h:337
HgemmTransformerA< GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA >::Transformer GlobalTransformerA
The default transformer for A.
Definition: hgemm_traits.h:285
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the computed matrix product.
Definition: gemm_global_stream.h:161
Definition: gemm_traits.h:273
Definition: hgemm_traits.h:125
Describes layouts of matrices.
Definition: matrix_traits.h:35
SharedLoadStream< SharedLoadIteratorB > SharedLoadStreamB
The stream to load B from shared memory.
Definition: hgemm_traits.h:327
GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > Base
The base config.
Definition: hgemm_traits.h:133
TileLoadIterator< typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: hgemm_traits.h:317
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:302
SimplifiedGemmEpilogueTraits< GemmConfig, EpilogueFunctor_, Index_ > GemmEpilogueTraits
The traits class for the epilogue.
Definition: hgemm_traits.h:335
Defines iterators for efficiently loading and storing tiles to and from shared memory.
Definition: matrix_traits.h:36
Definition: gemm_traits.h:205
Definition: gemm_shared_stream.h:44
Defines a type for restructuring a tile.
Specialization implementing multiply-add operation on half-precision floating point fragments...
Definition: gemm_traits.h:79
Transposes a tile of 16b elements. Used by HGEMM to construct a K-strided layout in shared memory for multiplicands.
Definition: gemm_traits.h:137
GemmSharedLoadTileBTraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^N.
Definition: hgemm_traits.h:249
Definition: matrix_traits.h:43
HgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ > GemmConfig
The HGEMM config.
Definition: hgemm_traits.h:274
Definition: hgemm_traits.h:190
GlobalLoadStream< GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: hgemm_traits.h:310
GemmConfig::MultiplyAdd MultiplyAdd
The functor to do the multiply-add in the main loop.
Definition: hgemm_traits.h:330
HgemmTileTraitsHelperB< kLayoutB_, GemmConfig > GemmTileTraitsHelperB
The GEMM config for B.
Definition: hgemm_traits.h:278
Definition: gemm_traits.h:428
Definition: hgemm_global_tile.h:48
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
Definition: gemm_epilogue_traits.h:300
GemmSharedLoadTileATraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^T.
Definition: hgemm_traits.h:184
HgemmTileTraitsHelperA< kLayoutA_, GemmConfig > GemmTileTraitsHelperA
The GEMM config for A.
Definition: hgemm_traits.h:276
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:43
Definition: matrix_traits.h:36
Kind
Definition: matrix_traits.h:36
HgemmTransformerB< GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB >::Transformer GlobalTransformerB
Definition: hgemm_traits.h:301
Definition: gemm_traits.h:278
Definition: hgemm_traits.h:271
HgemmCrosswiseGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, half const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: hgemm_traits.h:148
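Tile traits used to construct global tile iterator for HGEMM. This is intended to partition the thread block-level tile into 2D subtiles loaded by the threads and facilitate memory accesses larger than 16 bits.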
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:40
Definition: matrix_traits.h:43
Implements a software-pipelined efficient GEMM.
ReshapeThreads< Tile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:87
Defines structural properties of the GEMM epilogue.
Definition: hgemm_swizzle.h:40
Defines conversion operations among Fragments of different base type.
Definition: hgemm_traits.h:57
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:620
TileStoreIterator< typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: hgemm_traits.h:307